Linux virtualization list

Linux virtualization list
 help / color / mirror / Atom feed

* Re: [PATCH] VSOCK: Send reset control packet when socket is partially bound
From: David Miller @ 2018-12-17 19:15 UTC (permalink / raw)
  To: jhansen; +Cc: pv-drivers, netdev, gregkh, linux-kernel, virtualization
In-Reply-To: <1544607539-52634-1-git-send-email-jhansen@vmware.com>

From: Jorgen Hansen <jhansen@vmware.com>
Date: Wed, 12 Dec 2018 01:38:59 -0800

>  static int vmci_transport_send_reset_bh(struct sockaddr_vm *dst,
> @@ -312,12 +328,29 @@ static int vmci_transport_send_reset_bh(struct sockaddr_vm *dst,
>  static int vmci_transport_send_reset(struct sock *sk,
>  				     struct vmci_transport_packet *pkt)
>  {
> +	struct sockaddr_vm dst;
> +	struct sockaddr_vm *dst_ptr;
> +	struct vsock_sock *vsk;
> +

Please order local variables from longest to shortest line.

Thank you.

^ permalink raw reply

* [PATCH v3 04/12] Revert "x86/paravirt: Work around GCC inlining bugs when compiling paravirt ops"
From: Masahiro Yamada @ 2018-12-17 16:03 UTC (permalink / raw)
  To: x86, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	H . Peter Anvin
  Cc: Juergen Gross, Richard Biener, Kees Cook, Segher Boessenkool,
	Peter Zijlstra, linux-kernel, virtualization, Masahiro Yamada,
	Nadav Amit, Josh Poimboeuf, Alok Kataria, Linus Torvalds
In-Reply-To: <1545062607-8599-1-git-send-email-yamada.masahiro@socionext.com>

This reverts commit 494b5168f2de009eb80f198f668da374295098dd.

The in-kernel workarounds will be replaced with GCC's new
"asm inline" syntax.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
---

 arch/x86/include/asm/paravirt_types.h | 56 ++++++++++++++++++-----------------
 arch/x86/kernel/macros.S              |  1 -
 2 files changed, 29 insertions(+), 28 deletions(-)

diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 26942ad..488c596 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -348,11 +348,23 @@ extern struct paravirt_patch_template pv_ops;
 #define paravirt_clobber(clobber)		\
 	[paravirt_clobber] "i" (clobber)
 
+/*
+ * Generate some code, and mark it as patchable by the
+ * apply_paravirt() alternate instruction patcher.
+ */
+#define _paravirt_alt(insn_string, type, clobber)	\
+	"771:\n\t" insn_string "\n" "772:\n"		\
+	".pushsection .parainstructions,\"a\"\n"	\
+	_ASM_ALIGN "\n"					\
+	_ASM_PTR " 771b\n"				\
+	"  .byte " type "\n"				\
+	"  .byte 772b-771b\n"				\
+	"  .short " clobber "\n"			\
+	".popsection\n"
+
 /* Generate patchable code, with the default asm parameters. */
-#define paravirt_call							\
-	"PARAVIRT_CALL type=\"%c[paravirt_typenum]\""			\
-	" clobber=\"%c[paravirt_clobber]\""				\
-	" pv_opptr=\"%c[paravirt_opptr]\";"
+#define paravirt_alt(insn_string)					\
+	_paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]")
 
 /* Simple instruction patching code. */
 #define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t"
@@ -373,6 +385,16 @@ unsigned native_patch(u8 type, void *ibuf, unsigned long addr, unsigned len);
 int paravirt_disable_iospace(void);
 
 /*
+ * This generates an indirect call based on the operation type number.
+ * The type number, computed in PARAVIRT_PATCH, is derived from the
+ * offset into the paravirt_patch_template structure, and can therefore be
+ * freely converted back into a structure offset.
+ */
+#define PARAVIRT_CALL					\
+	ANNOTATE_RETPOLINE_SAFE				\
+	"call *%c[paravirt_opptr];"
+
+/*
  * These macros are intended to wrap calls through one of the paravirt
  * ops structs, so that they can be later identified and patched at
  * runtime.
@@ -509,7 +531,7 @@ int paravirt_disable_iospace(void);
 		/* since this condition will never hold */		\
 		if (sizeof(rettype) > sizeof(unsigned long)) {		\
 			asm volatile(pre				\
-				     paravirt_call			\
+				     paravirt_alt(PARAVIRT_CALL)	\
 				     post				\
 				     : call_clbr, ASM_CALL_CONSTRAINT	\
 				     : paravirt_type(op),		\
@@ -519,7 +541,7 @@ int paravirt_disable_iospace(void);
 			__ret = (rettype)((((u64)__edx) << 32) | __eax); \
 		} else {						\
 			asm volatile(pre				\
-				     paravirt_call			\
+				     paravirt_alt(PARAVIRT_CALL)	\
 				     post				\
 				     : call_clbr, ASM_CALL_CONSTRAINT	\
 				     : paravirt_type(op),		\
@@ -546,7 +568,7 @@ int paravirt_disable_iospace(void);
 		PVOP_VCALL_ARGS;					\
 		PVOP_TEST_NULL(op);					\
 		asm volatile(pre					\
-			     paravirt_call				\
+			     paravirt_alt(PARAVIRT_CALL)		\
 			     post					\
 			     : call_clbr, ASM_CALL_CONSTRAINT		\
 			     : paravirt_type(op),			\
@@ -664,26 +686,6 @@ struct paravirt_patch_site {
 extern struct paravirt_patch_site __parainstructions[],
 	__parainstructions_end[];
 
-#else	/* __ASSEMBLY__ */
-
-/*
- * This generates an indirect call based on the operation type number.
- * The type number, computed in PARAVIRT_PATCH, is derived from the
- * offset into the paravirt_patch_template structure, and can therefore be
- * freely converted back into a structure offset.
- */
-.macro PARAVIRT_CALL type:req clobber:req pv_opptr:req
-771:	ANNOTATE_RETPOLINE_SAFE
-	call *\pv_opptr
-772:	.pushsection .parainstructions,"a"
-	_ASM_ALIGN
-	_ASM_PTR 771b
-	.byte \type
-	.byte 772b-771b
-	.short \clobber
-	.popsection
-.endm
-
 #endif	/* __ASSEMBLY__ */
 
 #endif	/* _ASM_X86_PARAVIRT_TYPES_H */
diff --git a/arch/x86/kernel/macros.S b/arch/x86/kernel/macros.S
index 71d8b71..66ccb8e 100644
--- a/arch/x86/kernel/macros.S
+++ b/arch/x86/kernel/macros.S
@@ -10,4 +10,3 @@
 #include <asm/refcount.h>
 #include <asm/alternative-asm.h>
 #include <asm/bug.h>
-#include <asm/paravirt.h>
-- 
2.7.4

^ permalink raw reply related

* [PATCH v3 00/12] x86, kbuild: revert macrofying inline assembly code
From: Masahiro Yamada @ 2018-12-17 16:03 UTC (permalink / raw)
  To: x86, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	H . Peter Anvin
  Cc: Peter Zijlstra, Alexei Starovoitov, virtualization,
	Masahiro Yamada, Nadav Amit, Jan Beulich, Andrey Ryabinin,
	linux-arch, Arnd Bergmann, linux-kbuild, linux-sparse,
	Yonghong Song, Alexey Dobriyan, Kees Cook, Segher Boessenkool,
	Jann Horn, Arnaldo Carvalho de Melo, Josh Poimboeuf, Alok Kataria,
	Juergen Gross, Michal Marek, Richard Biener, Ard Biesheuvel,
	linux-kernel, David

This series reverts the in-kernel workarounds for inlining issues.

The commit description of 77b0bf55bc67 mentioned
"We also hope that GCC will eventually get fixed,..."

Now, GCC provides a solution.

https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html
explains the new "asm inline" syntax.

The performance issue will be eventually solved.

[About Code cleanups]

I know Nadav Amit is opposed to the full revert.
He also claims his motivation for macrofying was not only
performance, but also cleanups.

IIUC, the criticism addresses the code duplication between C and ASM.

If so, I'd like to suggest a different approach for cleanups.
Please see the last 3 patches.
IMHO, the preprocessor approach is more straightforward and readable.
Basically, this idea should work because it is what we already do for
__ASM_FORM() etc.

[Quick Test of "asm inline" of GCC 9]

If you want to try "asm inline" feature, the patch is available:
https://lore.kernel.org/patchwork/patch/1024590/

The number of symbols for arch/x86/configs/x86_64_defconfig:

                            nr_symbols
  [1]    v4.20-rc7       :   96502
  [2]    [1]+full revert :   96705   (+203)
  [3]    [2]+"asm inline":   96568   (-137)

[3]: apply my patch, then replace "asm" -> "asm_inline"
    for _BUG_FLAGS(), refcount_add(), refcount_inc(), refcount_dec(),
        annotate_reachable(), annotate_unreachable()


Changes in v3:
  - Split into per-commit revert (per Nadav Amit)
  - Add some cleanups with preprocessor approach

Changes in v2:
  - Revive clean-ups made by 5bdcd510c2ac (per Peter Zijlstra)
  - Fix commit quoting style (per Peter Zijlstra)

Masahiro Yamada (12):
  Revert "x86/jump-labels: Macrofy inline assembly code to work around
    GCC inlining bugs"
  Revert "x86/cpufeature: Macrofy inline assembly code to work around
    GCC inlining bugs"
  Revert "x86/extable: Macrofy inline assembly code to work around GCC
    inlining bugs"
  Revert "x86/paravirt: Work around GCC inlining bugs when compiling
    paravirt ops"
  Revert "x86/bug: Macrofy the BUG table section handling, to work
    around GCC inlining bugs"
  Revert "x86/alternatives: Macrofy lock prefixes to work around GCC
    inlining bugs"
  Revert "x86/refcount: Work around GCC inlining bug"
  Revert "x86/objtool: Use asm macros to work around GCC inlining bugs"
  Revert "kbuild/Makefile: Prepare for using macros in inline assembly
    code to work around asm() related GCC inlining bugs"
  linux/linkage: add ASM() macro to reduce duplication between C/ASM
    code
  x86/alternatives: consolidate LOCK_PREFIX macro
  x86/asm: consolidate ASM_EXTABLE_* macros

 Makefile                                  |  9 +--
 arch/x86/Makefile                         |  7 ---
 arch/x86/include/asm/alternative-asm.h    | 22 +------
 arch/x86/include/asm/alternative-common.h | 47 +++++++++++++++
 arch/x86/include/asm/alternative.h        | 30 +---------
 arch/x86/include/asm/asm.h                | 46 +++++----------
 arch/x86/include/asm/bug.h                | 98 +++++++++++++------------------
 arch/x86/include/asm/cpufeature.h         | 82 +++++++++++---------------
 arch/x86/include/asm/jump_label.h         | 22 +++++--
 arch/x86/include/asm/paravirt_types.h     | 56 +++++++++---------
 arch/x86/include/asm/refcount.h           | 81 +++++++++++--------------
 arch/x86/kernel/macros.S                  | 16 -----
 include/asm-generic/bug.h                 |  8 +--
 include/linux/compiler.h                  | 56 ++++--------------
 include/linux/linkage.h                   |  8 +++
 scripts/Kbuild.include                    |  4 +-
 scripts/mod/Makefile                      |  2 -
 17 files changed, 249 insertions(+), 345 deletions(-)
 create mode 100644 arch/x86/include/asm/alternative-common.h
 delete mode 100644 arch/x86/kernel/macros.S

-- 
2.7.4

^ permalink raw reply

* Re: [PATCH]  Export mm_update_next_owner function for vhost-net
From: Christoph Hellwig @ 2018-12-17 11:32 UTC (permalink / raw)
  To: gchen.guomin
  Cc: kvm, Michael S. Tsirkin, netdev, linux-kernel, Dominik Brodowski,
	virtualization, Sudip Mukherjee, Luis R. Rodriguez,
	Eric W. Biederman, Andrew Morton, guominchen
In-Reply-To: <1544676445-14897-1-git-send-email-gchen.guomin@gmail.com>

This seems like an issue all the unuse_mm users (at least those
outside of swapfile.c) have, so it should be solved in the core.

Bonus points for moving the set_fs magic into use_mm()..

^ permalink raw reply

* Re: [PATCH v2] x86, kbuild: revert macrofying inline assembly code
From: Masahiro Yamada @ 2018-12-17  2:54 UTC (permalink / raw)
  To: Nadav Amit
  Cc: linux-arch, Juergen Gross, Michal Marek, Richard Biener,
	Arnd Bergmann, Segher Boessenkool, linux-kbuild@vger.kernel.org,
	X86 ML, Linux Kernel Mailing List,
	virtualization@lists.linux-foundation.org,
	linux-sparse@vger.kernel.org, Ingo Molnar, Borislav Petkov,
	H . Peter Anvin, Thomas Gleixner, Alok Kataria,
	Luc Van Oostenryck
In-Reply-To: <9691ECB6-D659-4E88-B8AF-4947B9431AF1@vmware.com>

On Sun, Dec 16, 2018 at 12:29 PM Nadav Amit <namit@vmware.com> wrote:
>
> > On Dec 15, 2018, at 6:50 PM, Masahiro Yamada <yamada.masahiro@socionext.com> wrote:
> >
> > Revert the following 9 commits:
> >
> > [1] 5bdcd510c2ac ("x86/jump-labels: Macrofy inline assembly code to
> >    work around GCC inlining bugs")
> >
> >    This was partially reverted because it made good cleanups
> >    irrespective of the inlining issue; the error message is still
> >    unneeded, and the conversion to STATIC_BRANCH_{NOP,JUMP} should
> >    be kept.
> >
> > [2] d5a581d84ae6 ("x86/cpufeature: Macrofy inline assembly code to
> >    work around GCC inlining bugs")
> >
> > [3] 0474d5d9d2f7 ("x86/extable: Macrofy inline assembly code to work
> >    around GCC inlining bugs")
> >
> > [4] 494b5168f2de ("x86/paravirt: Work around GCC inlining bugs when
> >    compiling paravirt ops")
> >
> > [5] f81f8ad56fd1 ("x86/bug: Macrofy the BUG table section handling,
> >    to work around GCC inlining bugs")
> >
> > [6] 77f48ec28e4c ("x86/alternatives: Macrofy lock prefixes to work
> >   around GCC inlining bugs")
> >
> > [7] 9e1725b41059 ("x86/refcount: Work around GCC inlining bug")
> >
> >    Resolved conflicts in arch/x86/include/asm/refcount.h caused by
> >    288e4521f0f6 ("x86/asm: 'Simplify' GEN_*_RMWcc() macros").
> >
> > [8] c06c4d809051 ("x86/objtool: Use asm macros to work around GCC
> >    inlining bugs")
> >
> > [9] 77b0bf55bc67 ("kbuild/Makefile: Prepare for using macros in inline
> >    assembly code to work around asm() related GCC inlining bugs")
> >
> > A few days after those commits applied, discussion started to solve
> > the issue more elegantly with the help of compiler:
> >
> >  https://lkml.org/lkml/2018/10/7/92
> >
> > The new syntax "asm inline" was implemented by Segher Boessenkool, and
> > now queued up for GCC 9. (People were positive even for back-porting it
> > to older compilers).
> >
> > Since the in-kernel workarounds merged, some issues have been reported:
> > breakage of building with distcc/icecc, breakage of distro packages for
> > module building. (More fundamentally, we cannot build external modules
> > after 'make clean'.)
> >
> > I do not want to mess up the build system any more.
> >
> > Given that this issue will be solved in a cleaner way sooner or later,
> > let's revert the in-kernel workarounds, and wait for GCC 9.
> >
> > Reported-by: Logan Gunthorpe <logang@deltatee.com> # distcc
> > Reported-by: Sedat Dilek <sedat.dilek@gmail.com> # deb/rpm package
>
> It is customary to cc those who report an issue.

OK.

> The distcc issue has already been resolved both in distcc

Precisely, the fix-up was submitted,
but not pulled yet as of writing.
https://github.com/distcc/distcc/pull/313


> and in the patches
> I’ve sent: https://lkml.org/lkml/2018/11/15/467 .

I was scared by this ugly fix-up, so I rejected it.



> So I cannot understand why
> it is mentioned as a motivation.
>
> It sounds that the external modules can easily be resolved. Can you please
> provide a link for the bug report?


https://www.spinics.net/lists/linux-kbuild/msg20037.html

We can fix it under circumstances "we can do anything"
although I am scared by endless Makefile hacks.



> Please regard my comments regarding v1.

I will try my best, although I felt some of your requests were too much.
I am not an x86 developer.


I posted this so people can play with 'asm inline'
https://lore.kernel.org/patchwork/patch/1024590/

You can confirm that the vmlinux size increases
and some symbols disappear.



> I must admit that I’m very surprised
> that you don’t like the patches since you ack’d the original patch-set


I think ack and "I like it" are different.

There are situations where we had to accept something reluctantly.


Without my ack, your patch series would not have been
merged via x86 tree.

There was no other solution at that time.
Also, I could not predict potential problems, which turned out later.

So I let it go.

It would have been better if
the following discussion had started earlier.
https://lkml.org/lkml/2018/10/7/92


Now, we got a much cleaner solution.

I believe we should replace the workarounds with it.



> (and
> actually assisted me in changing the Makefile).

You were clearly breaking the build system.
So, I provided a less ugly solution.

Again, the in-kernel hack was the only solution at that time,
but the situation has changed since then.



-- 
Best Regards
Masahiro Yamada
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next 0/3] vhost: accelerate metadata access through vmap()
From: Michael S. Tsirkin @ 2018-12-16 19:57 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20181215.114308.647436101869587689.davem@davemloft.net>

On Sat, Dec 15, 2018 at 11:43:08AM -0800, David Miller wrote:
> From: Jason Wang <jasowang@redhat.com>
> Date: Fri, 14 Dec 2018 12:29:54 +0800
> 
> > 
> > On 2018/12/14 上午4:12, Michael S. Tsirkin wrote:
> >> On Thu, Dec 13, 2018 at 06:10:19PM +0800, Jason Wang wrote:
> >>> Hi:
> >>>
> >>> This series tries to access virtqueue metadata through kernel virtual
> >>> address instead of copy_user() friends since they had too much
> >>> overheads like checks, spec barriers or even hardware feature
> >>> toggling.
> >>>
> >>> Test shows about 24% improvement on TX PPS. It should benefit other
> >>> cases as well.
> >>>
> >>> Please review
> >> I think the idea of speeding up userspace access is a good one.
> >> However I think that moving all checks to start is way too aggressive.
> > 
> > 
> > So did packet and AF_XDP. Anyway, sharing address space and access
> > them directly is the fastest way. Performance is the major
> > consideration for people to choose backend. Compare to userspace
> > implementation, vhost does not have security advantages at any
> > level. If vhost is still slow, people will start to develop backends
> > based on e.g AF_XDP.
> 
> Exactly, this is precisely how this kind of problem should be solved.
> 
> Michael, I strongly support the approach Jason is taking here, and I
> would like to ask you to seriously reconsider your objections.
> 
> Thank you.

Okay. Won't be the first time I'm wrong.

Let's say we ignore security aspects, but we need to make sure the
following all keep working (broken with this revision):
- file backed memory (I didn't see where we mark memory dirty -
  if we don't we get guest memory corruption on close, if we do
  then host crash as https://lwn.net/Articles/774411/ seems to apply here?)
- THP
- auto-NUMA

Because vhost isn't like AF_XDP where you can just tell people "use
hugetlbfs" and "data is removed on close" - people are using it in lots
of configurations with guest memory shared between rings and unrelated
data.

Jason, thoughts on these?

-- 
MST
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH] vhost: return EINVAL if iovecs size does not match the message size
From: Michael S. Tsirkin @ 2018-12-16 19:41 UTC (permalink / raw)
  To: David Miller
  Cc: kvm, netdev, linux-kernel, virtualization, khorenko, ptikhomirov
In-Reply-To: <20181215.114611.1702968159223320911.davem@davemloft.net>

On Sat, Dec 15, 2018 at 11:46:11AM -0800, David Miller wrote:
> From: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
> Date: Thu, 13 Dec 2018 17:53:50 +0300
> 
> > We've failed to copy and process vhost_iotlb_msg so let userspace at
> > least know about it. For instance before this patch the code below runs
> > without any error:
>  ...
> > Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
> 
> Michael, will you be taking this in via your tree?
> 
> Thanks.

Will do, thanks!

^ permalink raw reply

* Re: [PATCH] kbuild, x86: revert macros in extended asm workarounds
From: Borislav Petkov via Virtualization @ 2018-12-16 10:00 UTC (permalink / raw)
  To: Nadav Amit, Masahiro Yamada
  Cc: linux-arch, Juergen Gross, Michal Marek, Richard Biener,
	Arnd Bergmann, Segher Boessenkool, Linux Kbuild mailing list,
	Peter Zijlstra, X86 ML, Linux Kernel Mailing List,
	virtualization@lists.linux-foundation.org,
	linux-sparse@vger.kernel.org, Ingo Molnar, Andy Lutomirski,
	H . Peter Anvin, Sedat Dilek, Thomas Gleixner, Logan Gunthorpe,
	Alok Kataria, Luc Van Oostenryck
In-Reply-To: <07BE39B2-1F99-4AE4-97F3-0871A39C5E7D@vmware.com>

On Sun, Dec 16, 2018 at 02:33:39AM +0000, Nadav Amit wrote:
> In general, I think that from the start it was clear that the motivation for
> the patch-set is not just performance but also better code. For example, I
> see no reason to revert the PV-changes or the lock-prefix changes that
> improved the code readability.

One thing that has caught my eye with the asm macros, which actually
decreases readability, is that I can't see the macro properly expanded
when I do

make <filename>.s

For example, I get

#APP
# 164 "./arch/x86/include/asm/cpufeature.h" 1
        STATIC_CPU_HAS bitnum=$8 cap_byte="boot_cpu_data+35(%rip)" feature=123 t_yes=.L75 t_no=.L78 always=117  #, MEM[(const char *)&boot_cpu_data + 35B],,,,
# 0 "" 2
        .loc 11 164 2 view .LVU480
#NO_APP

but I'd like to see the actual asm as it is really helpful when hacking
on inline asm stuff. And I haven't found a way to make gcc expand asm
macros in .s output.

Now, assuming the gcc inline patch will be backported to gcc8, I think
we should be covered on all modern distros going forward. So I think we
should revert at least the more complex macros.

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

^ permalink raw reply

* [PATCH v2] x86, kbuild: revert macrofying inline assembly code
From: Masahiro Yamada @ 2018-12-16  2:50 UTC (permalink / raw)
  To: x86, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	H . Peter Anvin
  Cc: linux-arch, Juergen Gross, Michal Marek, Richard Biener,
	Segher Boessenkool, Arnd Bergmann, linux-kbuild, linux-kernel,
	virtualization, Masahiro Yamada, linux-sparse, Nadav Amit,
	Alok Kataria, Luc Van Oostenryck

Revert the following 9 commits:

[1] 5bdcd510c2ac ("x86/jump-labels: Macrofy inline assembly code to
    work around GCC inlining bugs")

    This was partially reverted because it made good cleanups
    irrespective of the inlining issue; the error message is still
    unneeded, and the conversion to STATIC_BRANCH_{NOP,JUMP} should
    be kept.

[2] d5a581d84ae6 ("x86/cpufeature: Macrofy inline assembly code to
    work around GCC inlining bugs")

[3] 0474d5d9d2f7 ("x86/extable: Macrofy inline assembly code to work
    around GCC inlining bugs")

[4] 494b5168f2de ("x86/paravirt: Work around GCC inlining bugs when
    compiling paravirt ops")

[5] f81f8ad56fd1 ("x86/bug: Macrofy the BUG table section handling,
    to work around GCC inlining bugs")

[6] 77f48ec28e4c ("x86/alternatives: Macrofy lock prefixes to work
   around GCC inlining bugs")

[7] 9e1725b41059 ("x86/refcount: Work around GCC inlining bug")

    Resolved conflicts in arch/x86/include/asm/refcount.h caused by
    288e4521f0f6 ("x86/asm: 'Simplify' GEN_*_RMWcc() macros").

[8] c06c4d809051 ("x86/objtool: Use asm macros to work around GCC
    inlining bugs")

[9] 77b0bf55bc67 ("kbuild/Makefile: Prepare for using macros in inline
    assembly code to work around asm() related GCC inlining bugs")

A few days after those commits were applied, discussion started to solve
the issue more elegantly with the help of the compiler:

  https://lkml.org/lkml/2018/10/7/92

The new syntax "asm inline" was implemented by Segher Boessenkool, and
now queued up for GCC 9. (People were positive even for back-porting it
to older compilers).

Since the in-kernel workarounds were merged, some issues have been reported:
breakage of building with distcc/icecc, breakage of distro packages for
module building. (More fundamentally, we cannot build external modules
after 'make clean'.)

I do not want to mess up the build system any more.

Given that this issue will be solved in a cleaner way sooner or later,
let's revert the in-kernel workarounds, and wait for GCC 9.

Reported-by: Logan Gunthorpe <logang@deltatee.com> # distcc
Reported-by: Sedat Dilek <sedat.dilek@gmail.com> # deb/rpm package
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Nadav Amit <namit@vmware.com>
Cc: Segher Boessenkool <segher@kernel.crashing.org>
---

Please consider this for v4.20 release.
Currently, distro package build is broken.


Changes in v2:
  - Revive clean-ups made by 5bdcd510c2ac (per Peter Zijlstra)
  - Fix commit quoting style (per Peter Zijlstra)

 Makefile                               |  9 +---
 arch/x86/Makefile                      |  7 ---
 arch/x86/include/asm/alternative-asm.h | 20 +++----
 arch/x86/include/asm/alternative.h     | 11 +++-
 arch/x86/include/asm/asm.h             | 53 +++++++++++-------
 arch/x86/include/asm/bug.h             | 98 +++++++++++++++-------------------
 arch/x86/include/asm/cpufeature.h      | 82 ++++++++++++----------------
 arch/x86/include/asm/jump_label.h      | 22 ++++++--
 arch/x86/include/asm/paravirt_types.h  | 56 +++++++++----------
 arch/x86/include/asm/refcount.h        | 81 ++++++++++++----------------
 arch/x86/kernel/macros.S               | 16 ------
 include/asm-generic/bug.h              |  8 +--
 include/linux/compiler.h               | 56 +++++--------------
 scripts/Kbuild.include                 |  4 +-
 scripts/mod/Makefile                   |  2 -
 15 files changed, 224 insertions(+), 301 deletions(-)
 delete mode 100644 arch/x86/kernel/macros.S

diff --git a/Makefile b/Makefile
index f2c3423..4cf4c5b 100644
--- a/Makefile
+++ b/Makefile
@@ -1081,7 +1081,7 @@ scripts: scripts_basic scripts_dtc asm-generic gcc-plugins $(autoksyms_h)
 # version.h and scripts_basic is processed / created.
 
 # Listed in dependency order
-PHONY += prepare archprepare macroprepare prepare0 prepare1 prepare2 prepare3
+PHONY += prepare archprepare prepare0 prepare1 prepare2 prepare3
 
 # prepare3 is used to check if we are building in a separate output directory,
 # and if so do:
@@ -1104,9 +1104,7 @@ prepare2: prepare3 outputmakefile asm-generic
 prepare1: prepare2 $(version_h) $(autoksyms_h) include/generated/utsrelease.h
 	$(cmd_crmodverdir)
 
-macroprepare: prepare1 archmacros
-
-archprepare: archheaders archscripts macroprepare scripts_basic
+archprepare: archheaders archscripts prepare1 scripts_basic
 
 prepare0: archprepare gcc-plugins
 	$(Q)$(MAKE) $(build)=.
@@ -1174,9 +1172,6 @@ archheaders:
 PHONY += archscripts
 archscripts:
 
-PHONY += archmacros
-archmacros:
-
 PHONY += __headers
 __headers: $(version_h) scripts_basic uapi-asm-generic archheaders archscripts
 	$(Q)$(MAKE) $(build)=scripts build_unifdef
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 75ef499..85a66c4 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -232,13 +232,6 @@ archscripts: scripts_basic
 archheaders:
 	$(Q)$(MAKE) $(build)=arch/x86/entry/syscalls all
 
-archmacros:
-	$(Q)$(MAKE) $(build)=arch/x86/kernel arch/x86/kernel/macros.s
-
-ASM_MACRO_FLAGS = -Wa,arch/x86/kernel/macros.s
-export ASM_MACRO_FLAGS
-KBUILD_CFLAGS += $(ASM_MACRO_FLAGS)
-
 ###
 # Kernel objects
 
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index 8e4ea39..31b627b 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -7,24 +7,16 @@
 #include <asm/asm.h>
 
 #ifdef CONFIG_SMP
-.macro LOCK_PREFIX_HERE
+	.macro LOCK_PREFIX
+672:	lock
 	.pushsection .smp_locks,"a"
 	.balign 4
-	.long 671f - .		# offset
+	.long 672b - .
 	.popsection
-671:
-.endm
-
-.macro LOCK_PREFIX insn:vararg
-	LOCK_PREFIX_HERE
-	lock \insn
-.endm
+	.endm
 #else
-.macro LOCK_PREFIX_HERE
-.endm
-
-.macro LOCK_PREFIX insn:vararg
-.endm
+	.macro LOCK_PREFIX
+	.endm
 #endif
 
 /*
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index d7faa16..4cd6a3b 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -31,8 +31,15 @@
  */
 
 #ifdef CONFIG_SMP
-#define LOCK_PREFIX_HERE "LOCK_PREFIX_HERE\n\t"
-#define LOCK_PREFIX "LOCK_PREFIX "
+#define LOCK_PREFIX_HERE \
+		".pushsection .smp_locks,\"a\"\n"	\
+		".balign 4\n"				\
+		".long 671f - .\n" /* offset */		\
+		".popsection\n"				\
+		"671:"
+
+#define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock; "
+
 #else /* ! CONFIG_SMP */
 #define LOCK_PREFIX_HERE ""
 #define LOCK_PREFIX ""
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 21b0867..6467757b 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -120,25 +120,12 @@
 /* Exception table entry */
 #ifdef __ASSEMBLY__
 # define _ASM_EXTABLE_HANDLE(from, to, handler)			\
-	ASM_EXTABLE_HANDLE from to handler
-
-.macro ASM_EXTABLE_HANDLE from:req to:req handler:req
-	.pushsection "__ex_table","a"
-	.balign 4
-	.long (\from) - .
-	.long (\to) - .
-	.long (\handler) - .
+	.pushsection "__ex_table","a" ;				\
+	.balign 4 ;						\
+	.long (from) - . ;					\
+	.long (to) - . ;					\
+	.long (handler) - . ;					\
 	.popsection
-.endm
-#else /* __ASSEMBLY__ */
-
-# define _ASM_EXTABLE_HANDLE(from, to, handler)			\
-	"ASM_EXTABLE_HANDLE from=" #from " to=" #to		\
-	" handler=\"" #handler "\"\n\t"
-
-/* For C file, we already have NOKPROBE_SYMBOL macro */
-
-#endif /* __ASSEMBLY__ */
 
 # define _ASM_EXTABLE(from, to)					\
 	_ASM_EXTABLE_HANDLE(from, to, ex_handler_default)
@@ -161,7 +148,6 @@
 	_ASM_PTR (entry);					\
 	.popsection
 
-#ifdef __ASSEMBLY__
 .macro ALIGN_DESTINATION
 	/* check for bad alignment of destination */
 	movl %edi,%ecx
@@ -185,7 +171,34 @@
 	_ASM_EXTABLE_UA(100b, 103b)
 	_ASM_EXTABLE_UA(101b, 103b)
 	.endm
-#endif /* __ASSEMBLY__ */
+
+#else
+# define _EXPAND_EXTABLE_HANDLE(x) #x
+# define _ASM_EXTABLE_HANDLE(from, to, handler)			\
+	" .pushsection \"__ex_table\",\"a\"\n"			\
+	" .balign 4\n"						\
+	" .long (" #from ") - .\n"				\
+	" .long (" #to ") - .\n"				\
+	" .long (" _EXPAND_EXTABLE_HANDLE(handler) ") - .\n"	\
+	" .popsection\n"
+
+# define _ASM_EXTABLE(from, to)					\
+	_ASM_EXTABLE_HANDLE(from, to, ex_handler_default)
+
+# define _ASM_EXTABLE_UA(from, to)				\
+	_ASM_EXTABLE_HANDLE(from, to, ex_handler_uaccess)
+
+# define _ASM_EXTABLE_FAULT(from, to)				\
+	_ASM_EXTABLE_HANDLE(from, to, ex_handler_fault)
+
+# define _ASM_EXTABLE_EX(from, to)				\
+	_ASM_EXTABLE_HANDLE(from, to, ex_handler_ext)
+
+# define _ASM_EXTABLE_REFCOUNT(from, to)			\
+	_ASM_EXTABLE_HANDLE(from, to, ex_handler_refcount)
+
+/* For C file, we already have NOKPROBE_SYMBOL macro */
+#endif
 
 #ifndef __ASSEMBLY__
 /*
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index 5090035..6804d66 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -4,8 +4,6 @@
 
 #include <linux/stringify.h>
 
-#ifndef __ASSEMBLY__
-
 /*
  * Despite that some emulators terminate on UD2, we use it for WARN().
  *
@@ -22,15 +20,53 @@
 
 #define LEN_UD2		2
 
+#ifdef CONFIG_GENERIC_BUG
+
+#ifdef CONFIG_X86_32
+# define __BUG_REL(val)	".long " __stringify(val)
+#else
+# define __BUG_REL(val)	".long " __stringify(val) " - 2b"
+#endif
+
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+
+#define _BUG_FLAGS(ins, flags)						\
+do {									\
+	asm volatile("1:\t" ins "\n"					\
+		     ".pushsection __bug_table,\"aw\"\n"		\
+		     "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n"	\
+		     "\t"  __BUG_REL(%c0) "\t# bug_entry::file\n"	\
+		     "\t.word %c1"        "\t# bug_entry::line\n"	\
+		     "\t.word %c2"        "\t# bug_entry::flags\n"	\
+		     "\t.org 2b+%c3\n"					\
+		     ".popsection"					\
+		     : : "i" (__FILE__), "i" (__LINE__),		\
+			 "i" (flags),					\
+			 "i" (sizeof(struct bug_entry)));		\
+} while (0)
+
+#else /* !CONFIG_DEBUG_BUGVERBOSE */
+
 #define _BUG_FLAGS(ins, flags)						\
 do {									\
-	asm volatile("ASM_BUG ins=\"" ins "\" file=%c0 line=%c1 "	\
-		     "flags=%c2 size=%c3"				\
-		     : : "i" (__FILE__), "i" (__LINE__),                \
-			 "i" (flags),                                   \
+	asm volatile("1:\t" ins "\n"					\
+		     ".pushsection __bug_table,\"aw\"\n"		\
+		     "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n"	\
+		     "\t.word %c0"        "\t# bug_entry::flags\n"	\
+		     "\t.org 2b+%c1\n"					\
+		     ".popsection"					\
+		     : : "i" (flags),					\
 			 "i" (sizeof(struct bug_entry)));		\
 } while (0)
 
+#endif /* CONFIG_DEBUG_BUGVERBOSE */
+
+#else
+
+#define _BUG_FLAGS(ins, flags)  asm volatile(ins)
+
+#endif /* CONFIG_GENERIC_BUG */
+
 #define HAVE_ARCH_BUG
 #define BUG()							\
 do {								\
@@ -46,54 +82,4 @@ do {								\
 
 #include <asm-generic/bug.h>
 
-#else /* __ASSEMBLY__ */
-
-#ifdef CONFIG_GENERIC_BUG
-
-#ifdef CONFIG_X86_32
-.macro __BUG_REL val:req
-	.long \val
-.endm
-#else
-.macro __BUG_REL val:req
-	.long \val - 2b
-.endm
-#endif
-
-#ifdef CONFIG_DEBUG_BUGVERBOSE
-
-.macro ASM_BUG ins:req file:req line:req flags:req size:req
-1:	\ins
-	.pushsection __bug_table,"aw"
-2:	__BUG_REL val=1b	# bug_entry::bug_addr
-	__BUG_REL val=\file	# bug_entry::file
-	.word \line		# bug_entry::line
-	.word \flags		# bug_entry::flags
-	.org 2b+\size
-	.popsection
-.endm
-
-#else /* !CONFIG_DEBUG_BUGVERBOSE */
-
-.macro ASM_BUG ins:req file:req line:req flags:req size:req
-1:	\ins
-	.pushsection __bug_table,"aw"
-2:	__BUG_REL val=1b	# bug_entry::bug_addr
-	.word \flags		# bug_entry::flags
-	.org 2b+\size
-	.popsection
-.endm
-
-#endif /* CONFIG_DEBUG_BUGVERBOSE */
-
-#else /* CONFIG_GENERIC_BUG */
-
-.macro ASM_BUG ins:req file:req line:req flags:req size:req
-	\ins
-.endm
-
-#endif /* CONFIG_GENERIC_BUG */
-
-#endif /* __ASSEMBLY__ */
-
 #endif /* _ASM_X86_BUG_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 7d44272..aced6c9 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -2,10 +2,10 @@
 #ifndef _ASM_X86_CPUFEATURE_H
 #define _ASM_X86_CPUFEATURE_H
 
-#ifdef __KERNEL__
-#ifndef __ASSEMBLY__
-
 #include <asm/processor.h>
+
+#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
+
 #include <asm/asm.h>
 #include <linux/bitops.h>
 
@@ -161,10 +161,37 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
  */
 static __always_inline __pure bool _static_cpu_has(u16 bit)
 {
-	asm_volatile_goto("STATIC_CPU_HAS bitnum=%[bitnum] "
-			  "cap_byte=\"%[cap_byte]\" "
-			  "feature=%P[feature] t_yes=%l[t_yes] "
-			  "t_no=%l[t_no] always=%P[always]"
+	asm_volatile_goto("1: jmp 6f\n"
+		 "2:\n"
+		 ".skip -(((5f-4f) - (2b-1b)) > 0) * "
+			 "((5f-4f) - (2b-1b)),0x90\n"
+		 "3:\n"
+		 ".section .altinstructions,\"a\"\n"
+		 " .long 1b - .\n"		/* src offset */
+		 " .long 4f - .\n"		/* repl offset */
+		 " .word %P[always]\n"		/* always replace */
+		 " .byte 3b - 1b\n"		/* src len */
+		 " .byte 5f - 4f\n"		/* repl len */
+		 " .byte 3b - 2b\n"		/* pad len */
+		 ".previous\n"
+		 ".section .altinstr_replacement,\"ax\"\n"
+		 "4: jmp %l[t_no]\n"
+		 "5:\n"
+		 ".previous\n"
+		 ".section .altinstructions,\"a\"\n"
+		 " .long 1b - .\n"		/* src offset */
+		 " .long 0\n"			/* no replacement */
+		 " .word %P[feature]\n"		/* feature bit */
+		 " .byte 3b - 1b\n"		/* src len */
+		 " .byte 0\n"			/* repl len */
+		 " .byte 0\n"			/* pad len */
+		 ".previous\n"
+		 ".section .altinstr_aux,\"ax\"\n"
+		 "6:\n"
+		 " testb %[bitnum],%[cap_byte]\n"
+		 " jnz %l[t_yes]\n"
+		 " jmp %l[t_no]\n"
+		 ".previous\n"
 		 : : [feature]  "i" (bit),
 		     [always]   "i" (X86_FEATURE_ALWAYS),
 		     [bitnum]   "i" (1 << (bit & 7)),
@@ -199,44 +226,5 @@ static __always_inline __pure bool _static_cpu_has(u16 bit)
 #define CPU_FEATURE_TYPEVAL		boot_cpu_data.x86_vendor, boot_cpu_data.x86, \
 					boot_cpu_data.x86_model
 
-#else /* __ASSEMBLY__ */
-
-.macro STATIC_CPU_HAS bitnum:req cap_byte:req feature:req t_yes:req t_no:req always:req
-1:
-	jmp 6f
-2:
-	.skip -(((5f-4f) - (2b-1b)) > 0) * ((5f-4f) - (2b-1b)),0x90
-3:
-	.section .altinstructions,"a"
-	.long 1b - .		/* src offset */
-	.long 4f - .		/* repl offset */
-	.word \always		/* always replace */
-	.byte 3b - 1b		/* src len */
-	.byte 5f - 4f		/* repl len */
-	.byte 3b - 2b		/* pad len */
-	.previous
-	.section .altinstr_replacement,"ax"
-4:
-	jmp \t_no
-5:
-	.previous
-	.section .altinstructions,"a"
-	.long 1b - .		/* src offset */
-	.long 0			/* no replacement */
-	.word \feature		/* feature bit */
-	.byte 3b - 1b		/* src len */
-	.byte 0			/* repl len */
-	.byte 0			/* pad len */
-	.previous
-	.section .altinstr_aux,"ax"
-6:
-	testb \bitnum,\cap_byte
-	jnz \t_yes
-	jmp \t_no
-	.previous
-.endm
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* __KERNEL__ */
+#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */
 #endif /* _ASM_X86_CPUFEATURE_H */
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index a5fb34f..cf88ebf 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -20,9 +20,15 @@
 
 static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
 {
-	asm_volatile_goto("STATIC_BRANCH_NOP l_yes=\"%l[l_yes]\" key=\"%c0\" "
-			  "branch=\"%c1\""
-			: :  "i" (key), "i" (branch) : : l_yes);
+	asm_volatile_goto("1:"
+		".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t"
+		".pushsection __jump_table,  \"aw\" \n\t"
+		_ASM_ALIGN "\n\t"
+		".long 1b - ., %l[l_yes] - . \n\t"
+		_ASM_PTR "%c0 + %c1 - .\n\t"
+		".popsection \n\t"
+		: :  "i" (key), "i" (branch) : : l_yes);
+
 	return false;
 l_yes:
 	return true;
@@ -30,8 +36,14 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool bran
 
 static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch)
 {
-	asm_volatile_goto("STATIC_BRANCH_JMP l_yes=\"%l[l_yes]\" key=\"%c0\" "
-			  "branch=\"%c1\""
+	asm_volatile_goto("1:"
+		".byte 0xe9\n\t .long %l[l_yes] - 2f\n\t"
+		"2:\n\t"
+		".pushsection __jump_table,  \"aw\" \n\t"
+		_ASM_ALIGN "\n\t"
+		".long 1b - ., %l[l_yes] - . \n\t"
+		_ASM_PTR "%c0 + %c1 - .\n\t"
+		".popsection \n\t"
 		: :  "i" (key), "i" (branch) : : l_yes);
 
 	return false;
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 26942ad..488c596 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -348,11 +348,23 @@ extern struct paravirt_patch_template pv_ops;
 #define paravirt_clobber(clobber)		\
 	[paravirt_clobber] "i" (clobber)
 
+/*
+ * Generate some code, and mark it as patchable by the
+ * apply_paravirt() alternate instruction patcher.
+ */
+#define _paravirt_alt(insn_string, type, clobber)	\
+	"771:\n\t" insn_string "\n" "772:\n"		\
+	".pushsection .parainstructions,\"a\"\n"	\
+	_ASM_ALIGN "\n"					\
+	_ASM_PTR " 771b\n"				\
+	"  .byte " type "\n"				\
+	"  .byte 772b-771b\n"				\
+	"  .short " clobber "\n"			\
+	".popsection\n"
+
 /* Generate patchable code, with the default asm parameters. */
-#define paravirt_call							\
-	"PARAVIRT_CALL type=\"%c[paravirt_typenum]\""			\
-	" clobber=\"%c[paravirt_clobber]\""				\
-	" pv_opptr=\"%c[paravirt_opptr]\";"
+#define paravirt_alt(insn_string)					\
+	_paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]")
 
 /* Simple instruction patching code. */
 #define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t"
@@ -373,6 +385,16 @@ unsigned native_patch(u8 type, void *ibuf, unsigned long addr, unsigned len);
 int paravirt_disable_iospace(void);
 
 /*
+ * This generates an indirect call based on the operation type number.
+ * The type number, computed in PARAVIRT_PATCH, is derived from the
+ * offset into the paravirt_patch_template structure, and can therefore be
+ * freely converted back into a structure offset.
+ */
+#define PARAVIRT_CALL					\
+	ANNOTATE_RETPOLINE_SAFE				\
+	"call *%c[paravirt_opptr];"
+
+/*
  * These macros are intended to wrap calls through one of the paravirt
  * ops structs, so that they can be later identified and patched at
  * runtime.
@@ -509,7 +531,7 @@ int paravirt_disable_iospace(void);
 		/* since this condition will never hold */		\
 		if (sizeof(rettype) > sizeof(unsigned long)) {		\
 			asm volatile(pre				\
-				     paravirt_call			\
+				     paravirt_alt(PARAVIRT_CALL)	\
 				     post				\
 				     : call_clbr, ASM_CALL_CONSTRAINT	\
 				     : paravirt_type(op),		\
@@ -519,7 +541,7 @@ int paravirt_disable_iospace(void);
 			__ret = (rettype)((((u64)__edx) << 32) | __eax); \
 		} else {						\
 			asm volatile(pre				\
-				     paravirt_call			\
+				     paravirt_alt(PARAVIRT_CALL)	\
 				     post				\
 				     : call_clbr, ASM_CALL_CONSTRAINT	\
 				     : paravirt_type(op),		\
@@ -546,7 +568,7 @@ int paravirt_disable_iospace(void);
 		PVOP_VCALL_ARGS;					\
 		PVOP_TEST_NULL(op);					\
 		asm volatile(pre					\
-			     paravirt_call				\
+			     paravirt_alt(PARAVIRT_CALL)		\
 			     post					\
 			     : call_clbr, ASM_CALL_CONSTRAINT		\
 			     : paravirt_type(op),			\
@@ -664,26 +686,6 @@ struct paravirt_patch_site {
 extern struct paravirt_patch_site __parainstructions[],
 	__parainstructions_end[];
 
-#else	/* __ASSEMBLY__ */
-
-/*
- * This generates an indirect call based on the operation type number.
- * The type number, computed in PARAVIRT_PATCH, is derived from the
- * offset into the paravirt_patch_template structure, and can therefore be
- * freely converted back into a structure offset.
- */
-.macro PARAVIRT_CALL type:req clobber:req pv_opptr:req
-771:	ANNOTATE_RETPOLINE_SAFE
-	call *\pv_opptr
-772:	.pushsection .parainstructions,"a"
-	_ASM_ALIGN
-	_ASM_PTR 771b
-	.byte \type
-	.byte 772b-771b
-	.short \clobber
-	.popsection
-.endm
-
 #endif	/* __ASSEMBLY__ */
 
 #endif	/* _ASM_X86_PARAVIRT_TYPES_H */
diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h
index a8b5e1e..dbaed55 100644
--- a/arch/x86/include/asm/refcount.h
+++ b/arch/x86/include/asm/refcount.h
@@ -4,41 +4,6 @@
  * x86-specific implementation of refcount_t. Based on PAX_REFCOUNT from
  * PaX/grsecurity.
  */
-
-#ifdef __ASSEMBLY__
-
-#include <asm/asm.h>
-#include <asm/bug.h>
-
-.macro REFCOUNT_EXCEPTION counter:req
-	.pushsection .text..refcount
-111:	lea \counter, %_ASM_CX
-112:	ud2
-	ASM_UNREACHABLE
-	.popsection
-113:	_ASM_EXTABLE_REFCOUNT(112b, 113b)
-.endm
-
-/* Trigger refcount exception if refcount result is negative. */
-.macro REFCOUNT_CHECK_LT_ZERO counter:req
-	js 111f
-	REFCOUNT_EXCEPTION counter="\counter"
-.endm
-
-/* Trigger refcount exception if refcount result is zero or negative. */
-.macro REFCOUNT_CHECK_LE_ZERO counter:req
-	jz 111f
-	REFCOUNT_CHECK_LT_ZERO counter="\counter"
-.endm
-
-/* Trigger refcount exception unconditionally. */
-.macro REFCOUNT_ERROR counter:req
-	jmp 111f
-	REFCOUNT_EXCEPTION counter="\counter"
-.endm
-
-#else /* __ASSEMBLY__ */
-
 #include <linux/refcount.h>
 #include <asm/bug.h>
 
@@ -50,12 +15,35 @@
  * central refcount exception. The fixup address for the exception points
  * back to the regular execution flow in .text.
  */
+#define _REFCOUNT_EXCEPTION				\
+	".pushsection .text..refcount\n"		\
+	"111:\tlea %[var], %%" _ASM_CX "\n"		\
+	"112:\t" ASM_UD2 "\n"				\
+	ASM_UNREACHABLE					\
+	".popsection\n"					\
+	"113:\n"					\
+	_ASM_EXTABLE_REFCOUNT(112b, 113b)
+
+/* Trigger refcount exception if refcount result is negative. */
+#define REFCOUNT_CHECK_LT_ZERO				\
+	"js 111f\n\t"					\
+	_REFCOUNT_EXCEPTION
+
+/* Trigger refcount exception if refcount result is zero or negative. */
+#define REFCOUNT_CHECK_LE_ZERO				\
+	"jz 111f\n\t"					\
+	REFCOUNT_CHECK_LT_ZERO
+
+/* Trigger refcount exception unconditionally. */
+#define REFCOUNT_ERROR					\
+	"jmp 111f\n\t"					\
+	_REFCOUNT_EXCEPTION
 
 static __always_inline void refcount_add(unsigned int i, refcount_t *r)
 {
 	asm volatile(LOCK_PREFIX "addl %1,%0\n\t"
-		"REFCOUNT_CHECK_LT_ZERO counter=\"%[counter]\""
-		: [counter] "+m" (r->refs.counter)
+		REFCOUNT_CHECK_LT_ZERO
+		: [var] "+m" (r->refs.counter)
 		: "ir" (i)
 		: "cc", "cx");
 }
@@ -63,32 +51,31 @@ static __always_inline void refcount_add(unsigned int i, refcount_t *r)
 static __always_inline void refcount_inc(refcount_t *r)
 {
 	asm volatile(LOCK_PREFIX "incl %0\n\t"
-		"REFCOUNT_CHECK_LT_ZERO counter=\"%[counter]\""
-		: [counter] "+m" (r->refs.counter)
+		REFCOUNT_CHECK_LT_ZERO
+		: [var] "+m" (r->refs.counter)
 		: : "cc", "cx");
 }
 
 static __always_inline void refcount_dec(refcount_t *r)
 {
 	asm volatile(LOCK_PREFIX "decl %0\n\t"
-		"REFCOUNT_CHECK_LE_ZERO counter=\"%[counter]\""
-		: [counter] "+m" (r->refs.counter)
+		REFCOUNT_CHECK_LE_ZERO
+		: [var] "+m" (r->refs.counter)
 		: : "cc", "cx");
 }
 
 static __always_inline __must_check
 bool refcount_sub_and_test(unsigned int i, refcount_t *r)
 {
-
 	return GEN_BINARY_SUFFIXED_RMWcc(LOCK_PREFIX "subl",
-					 "REFCOUNT_CHECK_LT_ZERO counter=\"%[var]\"",
+					 REFCOUNT_CHECK_LT_ZERO,
 					 r->refs.counter, e, "er", i, "cx");
 }
 
 static __always_inline __must_check bool refcount_dec_and_test(refcount_t *r)
 {
 	return GEN_UNARY_SUFFIXED_RMWcc(LOCK_PREFIX "decl",
-					"REFCOUNT_CHECK_LT_ZERO counter=\"%[var]\"",
+					REFCOUNT_CHECK_LT_ZERO,
 					r->refs.counter, e, "cx");
 }
 
@@ -106,8 +93,8 @@ bool refcount_add_not_zero(unsigned int i, refcount_t *r)
 
 		/* Did we try to increment from/to an undesirable state? */
 		if (unlikely(c < 0 || c == INT_MAX || result < c)) {
-			asm volatile("REFCOUNT_ERROR counter=\"%[counter]\""
-				     : : [counter] "m" (r->refs.counter)
+			asm volatile(REFCOUNT_ERROR
+				     : : [var] "m" (r->refs.counter)
 				     : "cc", "cx");
 			break;
 		}
@@ -122,6 +109,4 @@ static __always_inline __must_check bool refcount_inc_not_zero(refcount_t *r)
 	return refcount_add_not_zero(1, r);
 }
 
-#endif /* __ASSEMBLY__ */
-
 #endif
diff --git a/arch/x86/kernel/macros.S b/arch/x86/kernel/macros.S
deleted file mode 100644
index 161c950..0000000
--- a/arch/x86/kernel/macros.S
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-/*
- * This file includes headers whose assembly part includes macros which are
- * commonly used. The macros are precompiled into assmebly file which is later
- * assembled together with each compiled file.
- */
-
-#include <linux/compiler.h>
-#include <asm/refcount.h>
-#include <asm/alternative-asm.h>
-#include <asm/bug.h>
-#include <asm/paravirt.h>
-#include <asm/asm.h>
-#include <asm/cpufeature.h>
-#include <asm/jump_label.h>
diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index cdafa5e..20561a6 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -17,8 +17,10 @@
 #ifndef __ASSEMBLY__
 #include <linux/kernel.h>
 
-struct bug_entry {
+#ifdef CONFIG_BUG
+
 #ifdef CONFIG_GENERIC_BUG
+struct bug_entry {
 #ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS
 	unsigned long	bug_addr;
 #else
@@ -33,10 +35,8 @@ struct bug_entry {
 	unsigned short	line;
 #endif
 	unsigned short	flags;
-#endif	/* CONFIG_GENERIC_BUG */
 };
-
-#ifdef CONFIG_BUG
+#endif	/* CONFIG_GENERIC_BUG */
 
 /*
  * Don't use BUG() or BUG_ON() unless there's really no way out; one
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 06396c1..fc5004a 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -99,13 +99,22 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
  * unique, to convince GCC not to merge duplicate inline asm statements.
  */
 #define annotate_reachable() ({						\
-	asm volatile("ANNOTATE_REACHABLE counter=%c0"			\
-		     : : "i" (__COUNTER__));				\
+	asm volatile("%c0:\n\t"						\
+		     ".pushsection .discard.reachable\n\t"		\
+		     ".long %c0b - .\n\t"				\
+		     ".popsection\n\t" : : "i" (__COUNTER__));		\
 })
 #define annotate_unreachable() ({					\
-	asm volatile("ANNOTATE_UNREACHABLE counter=%c0"			\
-		     : : "i" (__COUNTER__));				\
+	asm volatile("%c0:\n\t"						\
+		     ".pushsection .discard.unreachable\n\t"		\
+		     ".long %c0b - .\n\t"				\
+		     ".popsection\n\t" : : "i" (__COUNTER__));		\
 })
+#define ASM_UNREACHABLE							\
+	"999:\n\t"							\
+	".pushsection .discard.unreachable\n\t"				\
+	".long 999b - .\n\t"						\
+	".popsection\n\t"
 #else
 #define annotate_reachable()
 #define annotate_unreachable()
@@ -293,45 +302,6 @@ static inline void *offset_to_ptr(const int *off)
 	return (void *)((unsigned long)off + *off);
 }
 
-#else /* __ASSEMBLY__ */
-
-#ifdef __KERNEL__
-#ifndef LINKER_SCRIPT
-
-#ifdef CONFIG_STACK_VALIDATION
-.macro ANNOTATE_UNREACHABLE counter:req
-\counter:
-	.pushsection .discard.unreachable
-	.long \counter\()b -.
-	.popsection
-.endm
-
-.macro ANNOTATE_REACHABLE counter:req
-\counter:
-	.pushsection .discard.reachable
-	.long \counter\()b -.
-	.popsection
-.endm
-
-.macro ASM_UNREACHABLE
-999:
-	.pushsection .discard.unreachable
-	.long 999b - .
-	.popsection
-.endm
-#else /* CONFIG_STACK_VALIDATION */
-.macro ANNOTATE_UNREACHABLE counter:req
-.endm
-
-.macro ANNOTATE_REACHABLE counter:req
-.endm
-
-.macro ASM_UNREACHABLE
-.endm
-#endif /* CONFIG_STACK_VALIDATION */
-
-#endif /* LINKER_SCRIPT */
-#endif /* __KERNEL__ */
 #endif /* __ASSEMBLY__ */
 
 /* Compile time object size, -1 for unknown */
diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include
index bb01555..3d09844 100644
--- a/scripts/Kbuild.include
+++ b/scripts/Kbuild.include
@@ -115,9 +115,7 @@ __cc-option = $(call try-run,\
 
 # Do not attempt to build with gcc plugins during cc-option tests.
 # (And this uses delayed resolution so the flags will be up to date.)
-# In addition, do not include the asm macros which are built later.
-CC_OPTION_FILTERED = $(GCC_PLUGINS_CFLAGS) $(ASM_MACRO_FLAGS)
-CC_OPTION_CFLAGS = $(filter-out $(CC_OPTION_FILTERED),$(KBUILD_CFLAGS))
+CC_OPTION_CFLAGS = $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS))
 
 # cc-option
 # Usage: cflags-y += $(call cc-option,-march=winchip-c6,-march=i586)
diff --git a/scripts/mod/Makefile b/scripts/mod/Makefile
index a5b4af4..42c5d50 100644
--- a/scripts/mod/Makefile
+++ b/scripts/mod/Makefile
@@ -4,8 +4,6 @@ OBJECT_FILES_NON_STANDARD := y
 hostprogs-y	:= modpost mk_elfconfig
 always		:= $(hostprogs-y) empty.o
 
-CFLAGS_REMOVE_empty.o := $(ASM_MACRO_FLAGS)
-
 modpost-objs	:= modpost.o file2alias.o sumversion.o
 
 devicetable-offsets-file := devicetable-offsets.h
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH net-next 3/3] vhost: access vq metadata through kernel virtual address
From: David Miller @ 2018-12-15 21:15 UTC (permalink / raw)
  To: jasowang; +Cc: netdev, virtualization, linux-kernel, kvm, mst
In-Reply-To: <d70c5ac1-c805-cefd-ace1-643a2e271ba0@redhat.com>

From: Jason Wang <jasowang@redhat.com>
Date: Fri, 14 Dec 2018 11:57:35 +0800

> This is the price of all GUP users not only vhost itself. What's more
> important, the goal is not to be left too much behind for other
> backends like DPDK or AF_XDP (all of which are using GUP).

+1

^ permalink raw reply

* Re: [PATCH] vhost: return EINVAL if iovecs size does not match the message size
From: David Miller @ 2018-12-15 19:46 UTC (permalink / raw)
  To: ptikhomirov; +Cc: kvm, mst, netdev, linux-kernel, virtualization, khorenko
In-Reply-To: <20181213145350.5454-1-ptikhomirov@virtuozzo.com>

From: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Date: Thu, 13 Dec 2018 17:53:50 +0300

> We've failed to copy and process vhost_iotlb_msg so let userspace at
> least know about it. For instance, before this patch the code below runs
> without any error:
 ...
> Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>

Michael, will you be taking this in via your tree?

Thanks.

^ permalink raw reply

* Re: [PATCH net-next 0/3] vhost: accelerate metadata access through vmap()
From: David Miller @ 2018-12-15 19:43 UTC (permalink / raw)
  To: jasowang; +Cc: netdev, virtualization, linux-kernel, kvm, mst
In-Reply-To: <836932fc-9266-b73d-2ee5-645f399e1a54@redhat.com>

From: Jason Wang <jasowang@redhat.com>
Date: Fri, 14 Dec 2018 12:29:54 +0800

> 
> On 2018/12/14 上午4:12, Michael S. Tsirkin wrote:
>> On Thu, Dec 13, 2018 at 06:10:19PM +0800, Jason Wang wrote:
>>> Hi:
>>>
>>> This series tries to access virtqueue metadata through kernel virtual
>>> address instead of copy_user() friends since they had too much
>>> overheads like checks, spec barriers or even hardware feature
>>> toggling.
>>>
>>> Test shows about 24% improvement on TX PPS. It should benefit other
>>> cases as well.
>>>
>>> Please review
>> I think the idea of speeding up userspace access is a good one.
>> However I think that moving all checks to start is way too aggressive.
> 
> 
> So did packet and AF_XDP. Anyway, sharing address space and access
> them directly is the fastest way. Performance is the major
> consideration for people to choose backend. Compared to userspace
> implementation, vhost does not have security advantages at any
> level. If vhost is still slow, people will start to develop backends
> based on e.g AF_XDP.

Exactly, this is precisely how this kind of problem should be solved.

Michael, I strongly support the approach Jason is taking here, and I
would like to ask you to seriously reconsider your objections.

Thank you.

^ permalink raw reply

* Call for Papers - MICRADS 2019, Rio de Janeiro, Brazil | Deadline: December 28
From: Marle @ 2018-12-15 18:50 UTC (permalink / raw)
  To: virtualization

[-- Attachment #1.1: Type: text/plain, Size: 5881 bytes --]

***** Proceedings by Springer

--------------------------------------

MICRADS´19 - The 2019 Multidisciplinary International Conference of Research Applied to Defense and Security

Rio de Janeiro, Brazil, 8 - 10 May 2019

http://www.micrads.org/ <http://www.micrads.org/>

------------------------------------------------------------------------------------------------------------------------------

Scope

MICRADS´19 - The 2019 Multidisciplinary International Conference of Research Applied to Defense and Security, to be held at Rio de Janeiro, Brazil, 8 - 10 May 2019, is an international forum for researchers and practitioners to present and discuss the most recent innovations, trends, results, experiences and concerns in the several perspectives of Defense and Security.

We are pleased to invite you to submit your papers to MICRADS´19. They can be written in English, Spanish or Portuguese. All submissions will be reviewed on the basis of relevance, originality, importance and clarity.

Topics

Submitted papers should be related with one or more of the main themes proposed for the Conference:

Area A: Systems, Communication and Defense

A1) Information and Communication Technology in Education
A2) Simulation and computer vision in military applications
A3) Analysis and Signal Processing
A4) Cybersecurity and Cyberdefense
A5) Computer Networks, Mobility and Pervasive Systems

Area B: Strategy and political-administrative vision in Defense

B1) Safety and Maritime Protection
B2) Strategy, Geopolitics and Oceanopolitics
B3) Planning, economy and logistics applied to Defense
B4) Leadership and e-leadership
B5) Military Marketing
B6) Health informatics in military applications

Area C: Engineering and technologies applied to Defense

C1) Wearable Technology and Assistance Devices
C2) Military Naval Engineering
C3) Weapons and Combat Systems
C4) Chemical, Biological and Nuclear Defense
C5) Defense Engineering (General)

Submission and Decision

Submitted papers written in English (until 10-page limit) must comply with the format of Smart Innovation, Systems and Technologies series (see Instructions for Authors at Springer Website <https://www.springer.com/us/authors-editors/conference-proceedings/conference-proceedings-guidelines>), must not have been published before, not be under review for any other conference or publication and not include any information leading to the authors’ identification. Therefore, the authors’ names, affiliations and e-mails should not be included in the version for evaluation by the Scientific Committee. This information should only be included in the camera-ready version, saved in Word or Latex format and also in PDF format. These files must be accompanied by the Consent to Publish form <http://www.micrads.org/consent.doc> filled out, in a ZIP file, and uploaded at the conference management system.

Submitted papers written in Spanish or Portuguese (until 15-page limit) must comply with the format of RISTI <http://www.risti.xyz/> - Revista Ibérica de Sistemas e Tecnologias de Informação (download instructions/template for authors in Spanish <http://www.risti.xyz/formato-es.doc> or Portuguese <http://www.risti.xyz/formato-pt.doc>), must not have been published before, not be under review for any other conference or publication and not include any information leading to the authors’ identification. Therefore, the authors’ names, affiliations and e-mails should not be included in the version for evaluation by the Scientific Committee. This information should only be included in the camera-ready version, saved in Word. This file must be uploaded at the conference management system in a ZIP file.

All papers will be subjected to a “blind review” by at least two members of the Scientific Committee.

Based on Scientific Committee evaluation, a paper can be rejected or accepted by the Conference Chairs. In the latter case, it can be accepted as paper or poster.

The authors of papers accepted as posters must build and print a poster to be exhibited during the Conference. This poster must follow an A1 or A2 vertical format. The Conference can include Work Sessions where these posters are presented and orally discussed, with a 7 minute limit per poster.

The authors of accepted papers will have 15 minutes to present their work in a Conference Work Session; approximately 5 minutes of discussion will follow each presentation.

Publication and Indexing

To ensure that an accepted paper is published, at least one of the authors must be fully registered by the 11 of February 2019, and the paper must comply with the suggested layout and page-limit (until 10 pages). Additionally, all recommended changes must be addressed by the authors before they submit the camera-ready version.

No more than one paper per registration will be published. An extra fee must be paid for publication of additional papers, with a maximum of one additional paper per registration. One registration permits only the participation of one author in the conference.

Papers can be written in English, Spanish or Portuguese. Accepted and registered papers written in English will be published in Proceedings by Springer, in a book of its SIST series, and will be submitted for indexing by ISI, SCOPUS, EI-Compendex, SpringerLink, and Google Scholar.

Important Dates

Paper Submission: December 28, 2018

Notification of Acceptance: January 28, 2019

Payment of Registration, to ensure the inclusion of an accepted paper in the conference proceedings: February 11, 2019.

Camera-ready Submission: February 11, 2019

Website of MICRADS'19: http://www.micrads.org/ <http://www.micrads.org/>

---
This email has been checked for viruses by AVG.
https://www.avg.com

[-- Attachment #1.2: Type: text/html, Size: 8900 bytes --]

[-- Attachment #2: Type: text/plain, Size: 183 bytes --]

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next 0/3] vhost: accelerate metadata access through vmap()
From: Michael S. Tsirkin @ 2018-12-14 15:31 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20181214072334-mutt-send-email-mst@kernel.org>

On Fri, Dec 14, 2018 at 07:33:20AM -0500, Michael S. Tsirkin wrote:
> > - unsafe_put_user/unsafe_get_user is not sufficient for accessing structures
> > (e.g accessing descriptor) or arrays (batching).
> 
> So you want unsafe_copy_xxx_user? I can do this. Hang on will post.

Like this basically? Warning: completely untested.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>


diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index b5e58cc0c5e7..d2afd70793ca 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -728,5 +728,10 @@ do {										\
 	if (unlikely(__gu_err)) goto err_label;					\
 } while (0)
 
+#define unsafe_copy_from_user(to, from, n)					\
+	 __copy_user_ll(to, (__force const void *)from, n)
+#define unsafe_copy_to_user(to, from, n)					\
+	 __copy_user_ll((__force void *)to, from, n)
+
 #endif /* _ASM_X86_UACCESS_H */
 
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index efe79c1cdd47..b9d515ba2920 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -271,6 +271,8 @@ extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count);
 #define user_access_end() do { } while (0)
 #define unsafe_get_user(x, ptr, err) do { if (unlikely(__get_user(x, ptr))) goto err; } while (0)
 #define unsafe_put_user(x, ptr, err) do { if (unlikely(__put_user(x, ptr))) goto err; } while (0)
+#define unsafe_copy_to_user(from, to, n) copy_to_user(from, to, n)
+#define unsafe_copy_from_user(from, to, n) copy_from_user(from, to, n)
 #endif
 
 #ifdef CONFIG_HARDENED_USERCOPY

^ permalink raw reply related

* Re: [PATCH net-next 0/3] vhost: accelerate metadata access through vmap()
From: Michael S. Tsirkin @ 2018-12-14 15:16 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20181213101022.12475-1-jasowang@redhat.com>

On Thu, Dec 13, 2018 at 06:10:19PM +0800, Jason Wang wrote:
> Hi:
> 
> This series tries to access virtqueue metadata through kernel virtual
> address instead of copy_user() friends since they had too much
> overheads like checks, spec barriers or even hardware feature
> toggling.
> 
> Test shows about 24% improvement on TX PPS. It should benefit other
> cases as well.

BTW if the issue is all the error checking, maybe we should
consider using something like uaccess_catch and check
in a single place.

> Please review
> 
> Jason Wang (3):
>   vhost: generalize adding used elem
>   vhost: fine grain userspace memory accessors
>   vhost: access vq metadata through kernel virtual address
> 
>  drivers/vhost/vhost.c | 281 ++++++++++++++++++++++++++++++++++++++----
>  drivers/vhost/vhost.h |  11 ++
>  2 files changed, 266 insertions(+), 26 deletions(-)
> 
> -- 
> 2.17.1

^ permalink raw reply

* Re: [PATCH net-next 3/3] vhost: access vq metadata through kernel virtual address
From: kbuild test robot @ 2018-12-14 14:48 UTC (permalink / raw)
  To: Jason Wang; +Cc: kvm, mst, netdev, linux-kernel, virtualization, kbuild-all
In-Reply-To: <20181213101022.12475-4-jasowang@redhat.com>

[-- Attachment #1: Type: text/plain, Size: 2071 bytes --]

Hi Jason,

I love your patch! Yet something to improve:

[auto build test ERROR on net-next/master]

url:    https://github.com/0day-ci/linux/commits/Jason-Wang/vhost-accelerate-metadata-access-through-vmap/20181214-200417
config: mips-malta_kvm_defconfig (attached as .config)
compiler: mipsel-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        GCC_VERSION=7.2.0 make.cross ARCH=mips 

All errors (new ones prefixed by >>):

   drivers//vhost/vhost.c: In function 'vhost_init_vmap':
>> drivers//vhost/vhost.c:648:3: error: implicit declaration of function 'release_pages'; did you mean 'release_task'? [-Werror=implicit-function-declaration]
      release_pages(pages, npinned);
      ^~~~~~~~~~~~~
      release_task
   cc1: some warnings being treated as errors

vim +648 drivers//vhost/vhost.c

   619	
   620	static int vhost_init_vmap(struct vhost_vmap *map, unsigned long uaddr,
   621				   size_t size, int write)
   622	{
   623		struct page **pages;
   624		int npages = DIV_ROUND_UP(size, PAGE_SIZE);
   625		int npinned;
   626		void *vaddr;
   627	
   628		pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
   629		if (!pages)
   630			return -ENOMEM;
   631	
   632		npinned = get_user_pages_fast(uaddr, npages, write, pages);
   633		if (npinned != npages)
   634			goto err;
   635	
   636		vaddr = vmap(pages, npages, VM_MAP, PAGE_KERNEL);
   637		if (!vaddr)
   638			goto err;
   639	
   640		map->pages = pages;
   641		map->addr = vaddr + (uaddr & (PAGE_SIZE - 1));
   642		map->npages = npages;
   643	
   644		return 0;
   645	
   646	err:
   647		if (npinned > 0)
 > 648			release_pages(pages, npinned);
   649		kfree(pages);
   650		return -EFAULT;
   651	}
   652	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 19468 bytes --]

[-- Attachment #3: Type: text/plain, Size: 183 bytes --]

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH v2 0/5] VSOCK: support mergeable rx buffer in vhost-vsock
From: Michael S. Tsirkin @ 2018-12-14 13:22 UTC (permalink / raw)
  To: jiangyiwen; +Cc: netdev, kvm, Stefan Hajnoczi, virtualization
In-Reply-To: <5C1384E8.7040506@huawei.com>

On Fri, Dec 14, 2018 at 06:24:40PM +0800, jiangyiwen wrote:
> On 2018/12/12 23:09, Michael S. Tsirkin wrote:
> > On Wed, Dec 12, 2018 at 05:25:50PM +0800, jiangyiwen wrote:
> >> Now vsock only support send/receive small packet, it can't achieve
> >> high performance. As previous discussed with Jason Wang, I revisit the
> >> idea of vhost-net about mergeable rx buffer and implement the mergeable
> >> rx buffer in vhost-vsock, it can allow big packet to be scattered in
> >> into different buffers and improve performance obviously.
> >>
> >> This series of patches mainly did three things：
> >> - mergeable buffer implementation
> >> - increase the max send pkt size
> >> - add used and signal guest in a batch
> >>
> >> And I write a tool to test the vhost-vsock performance, mainly send big
> >> packet(64K) included guest->Host and Host->Guest. I test performance
> >> independently and the result as follows:
> >>
> >> Before performance:
> >>               Single socket            Multiple sockets(Max Bandwidth)
> >> Guest->Host   ~400MB/s                 ~480MB/s
> >> Host->Guest   ~1450MB/s                ~1600MB/s
> >>
> >> After performance only use implement mergeable rx buffer:
> >>               Single socket            Multiple sockets(Max Bandwidth)
> >> Guest->Host   ~400MB/s                 ~480MB/s
> >> Host->Guest   ~1280MB/s                ~1350MB/s
> >>
> >> In this case, max send pkt size is still limited to 4K, so Host->Guest
> >> performance will worse than before.
> > 
> > It's concerning though, what if application sends small packets?
> > What is the source of the slowdown? Do you know?
> > 
> 
> Hi Michael,
> 
> To the two cases, I test the results included small and big packets as
> follows:
> 
> 64K packets performance comparison:
>                                               Single socket    Multiple sockets
> Host->Guest(before)                           1352.60MB/s      1436.33MB/s
> 
> 
> Host->Guest(only use mergeable rx buffer)     1290.08MB/s      1212.67MB/s
> 
> 4K packets performance comparison:
>                                               Single socket    Multiple sockets
> Host->Guest(before)                           535.47MB/s       688.67MB/s
> Host->Guest(only use mergeable rx buffer)     522.33MB/s       599.00MB/s
> 
> 3K packets performance comparison:
>                                               Single socket    Multiple sockets
> Host->Guest(before)                           359.74MB/s       442.00MB/s
> Host->Guest(only use mergeable rx buffer)     374.47MB/s       452.33MB/s
> 
> We can see an interesting thing, for 64K and 4K packets,
> using mergeable buffer has a poor performance, for 3K packet,
> both have the same performance.
> 
> I guess in mergeable mode, when host send a 4k packet to guest, we
> should call vhost_get_vq_desc() twice in host(hdr + 4k data),
> and in guest we also should call virtqueue_get_buf() twice. So
> when packet is smaller than (4k - hdr), it can be packed in a
> single page, so the performance is the same as before.
> 
> So in the mergeable mode, the performance may be
> worse in ((4k - hdr), 4k] than before.
> 
> Thanks,
> Yiwen.


The conclusion seems to be that mergeable buffers themselves
only hurt performance, but they allow batching which improves
performance. So let's add batching without mergeable buffers then?


> >> After performance increase the max send pkt size to 64K:
> >>               Single socket            Multiple sockets(Max Bandwidth)
> >> Guest->Host   ~1700MB/s                ~2900MB/s
> >> Host->Guest   ~1500MB/s                ~2440MB/s
> >>
> >> After performance all patches are used:
> >>               Single socket            Multiple sockets(Max Bandwidth)
> >> Guest->Host   ~1700MB/s                ~2900MB/s
> >> Host->Guest   ~1700MB/s                ~2900MB/s
> >>
> >> From the test results, the performance is improved obviously, and guest
> >> memory will not be wasted.
> >>
> >> In addition, in order to support mergeable rx buffer in virtio-vsock,
> >> we need to add a qemu patch to support parse feature.
> >>
> >> ---
> >> v1 -> v2:
> >>  * Addressed comments from Jason Wang.
> >>  * Add performance test result independently.
> >>  * Use Skb_page_frag_refill() which can use high order page and reduce
> >>    the stress of page allocator.
> >>  * Still use fixed size(PAGE_SIZE) to fill rx buffer, because too small
> >>    size can't fill one full packet, we only 128 vq num now.
> >>  * Use iovec to replace buf in struct virtio_vsock_pkt, keep tx and rx
> >>    consistency.
> >>  * Add virtio_transport ops to get max pkt len, in order to be compatible
> >>    with old version.
> >> ---
> >>
> >> Yiwen Jiang (5):
> >>   VSOCK: support fill mergeable rx buffer in guest
> >>   VSOCK: support fill data to mergeable rx buffer in host
> >>   VSOCK: support receive mergeable rx buffer in guest
> >>   VSOCK: increase send pkt len in mergeable mode to improve performance
> >>   VSOCK: batch sending rx buffer to increase bandwidth
> >>
> >>  drivers/vhost/vsock.c                   | 183 ++++++++++++++++++++-----
> >>  include/linux/virtio_vsock.h            |  13 +-
> >>  include/uapi/linux/virtio_vsock.h       |   5 +
> >>  net/vmw_vsock/virtio_transport.c        | 229 +++++++++++++++++++++++++++-----
> >>  net/vmw_vsock/virtio_transport_common.c |  66 ++++++---
> >>  5 files changed, 411 insertions(+), 85 deletions(-)
> >>
> >> -- 
> >> 1.8.3.1
> > 
> > .
> > 
> 
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net V2 4/4] vhost: log dirty page correctly
From: Michael S. Tsirkin @ 2018-12-14 13:20 UTC (permalink / raw)
  To: Jason Wang; +Cc: Jintack Lim, netdev, linux-kernel, kvm, virtualization
In-Reply-To: <519ee6f7-06fc-ad49-03da-c096aeb24ced@redhat.com>

On Fri, Dec 14, 2018 at 10:43:03AM +0800, Jason Wang wrote:
> 
> On 2018/12/13 下午10:31, Michael S. Tsirkin wrote:
> > > Just to make sure I understand this. It looks to me we should:
> > > 
> > > - allow passing GIOVA->GPA through UAPI
> > > 
> > > - cache GIOVA->GPA somewhere but still use GIOVA->HVA in device IOTLB for
> > > performance
> > > 
> > > Is this what you suggest?
> > > 
> > > Thanks
> > Not really. We already have GPA->HVA, so I suggested a flag to pass
> > GIOVA->GPA in the IOTLB.
> > 
> > This has advantages for security since a single table needs
> > then to be validated to ensure guest does not corrupt
> > QEMU memory.
> > 
> 
> I wonder how much we can gain through this. Currently, qemu IOMMU gives
> GIOVA->GPA mapping, and qemu vhost code will translate GPA to HVA then pass
> GIOVA->HVA to vhost. It looks no difference to me.
> 
> Thanks

The difference is in security not in performance.  Getting a bad HVA
corrupts QEMU memory and it might be guest controlled. Very risky.  If
translations to HVA are done in a single place through a single table
it's safer as there's a single risky place.

-- 
MST
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next 0/3] vhost: accelerate metadata access through vmap()
From: Michael S. Tsirkin @ 2018-12-14 12:52 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <836932fc-9266-b73d-2ee5-645f399e1a54@redhat.com>

On Fri, Dec 14, 2018 at 12:29:54PM +0800, Jason Wang wrote:
> 
> On 2018/12/14 上午4:12, Michael S. Tsirkin wrote:
> > On Thu, Dec 13, 2018 at 06:10:19PM +0800, Jason Wang wrote:
> > > Hi:
> > > 
> > > This series tries to access virtqueue metadata through kernel virtual
> > > address instead of copy_user() friends since they had too much
> > > overheads like checks, spec barriers or even hardware feature
> > > toggling.
> > > 
> > > Test shows about 24% improvement on TX PPS. It should benefit other
> > > cases as well.
> > > 
> > > Please review
> > I think the idea of speeding up userspace access is a good one.
> > However I think that moving all checks to start is way too aggressive.
> 
> 
> So did packet and AF_XDP. Anyway, sharing address space and access them
> directly is the fastest way. Performance is the major consideration for
> people to choose backend. Compare to userspace implementation, vhost does
> not have security advantages at any level. If vhost is still slow, people
> will start to develop backends based on e.g AF_XDP.
> 

Let them. What's wrong with that?

> > Instead, let's batch things up but let's not keep them
> > around forever.
> > Here are some ideas:
> > 
> > 
> > 1. Disable preemption, process a small number of small packets
> >     directly in an atomic context. This should cut latency
> >     down significantly, the tricky part is to only do it
> >     on a light load and disable this
> >     for the streaming case otherwise it's unfair.
> >     This might fail, if it does just bounce things out to
> >     a thread.
> 
> 
> I'm not sure what context you meant here. Is this for TX path of TUN? But a
> fundamental difference is my series is targeted for extreme heavy load not
> light one, 100% cpu for vhost is expected.

Interesting. You only shared a TCP RR result though.
What's the performance gain in a heavy load case?

> 
> > 
> > 2. Switch to unsafe_put_user/unsafe_get_user,
> >     and batch up multiple accesses.
> 
> 
> As I said, unless we can batch accessing of two different places of three
> of avail, descriptor and used. It won't help for batching the accessing of a
> single place like used. I'm even not sure this can be done consider the case
> of packed virtqueue, we have a single descriptor ring.

So that's one of the reasons packed should be faster. Single access
and you get the descriptor no messy redirects. Somehow your
benchmarking so far didn't show a gain with vhost and
packed though - do you know what's wrong?

> Batching through
> unsafe helpers may not help in this case since it's equivalent to safe ones
> . And This requires non trivial refactoring of vhost. And such refactoring
> itself may give us noticeable impact (e.g it may lead regression).
> 
> 
> > 
> > 3. Allow adding a fixup point manually,
> >     such that multiple independent get_user accesses
> >     can get a single fixup (will allow better compiler
> >     optimizations).
> > 
> 
> So for metadata access, I don't see how you suggest here can help in the
> case of heavy workload.
> 
> For data access, this may help but I've played to batch the data copy to
> reduce SMAP/spec barriers in vhost-net but I don't see performance
> improvement.
> 
> Thanks

So how about we try to figure what's going on actually?
Can you drop the barriers and show the same gain?
E.g. vmap does not use a huge page IIRC so in fact it
can be slower than direct access. It's not a magic
faster way.



> 
> > 
> > 
> > 
> > > Jason Wang (3):
> > >    vhost: generalize adding used elem
> > >    vhost: fine grain userspace memory accessors
> > >    vhost: access vq metadata through kernel virtual address
> > > 
> > >   drivers/vhost/vhost.c | 281 ++++++++++++++++++++++++++++++++++++++----
> > >   drivers/vhost/vhost.h |  11 ++
> > >   2 files changed, 266 insertions(+), 26 deletions(-)
> > > 
> > > -- 
> > > 2.17.1
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next 3/3] vhost: access vq metadata through kernel virtual address
From: Michael S. Tsirkin @ 2018-12-14 12:36 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <d70c5ac1-c805-cefd-ace1-643a2e271ba0@redhat.com>

On Fri, Dec 14, 2018 at 11:57:35AM +0800, Jason Wang wrote:
> 
> On 2018/12/13 下午11:44, Michael S. Tsirkin wrote:
> > On Thu, Dec 13, 2018 at 06:10:22PM +0800, Jason Wang wrote:
> > > It was noticed that the copy_user() friends that was used to access
> > > virtqueue metadata tends to be very expensive for dataplane
> > > implementation like vhost since it involves lots of software check,
> > > speculation barrier, hardware feature toggling (e.g SMAP). The
> > > extra cost will be more obvious when transferring small packets.
> > > 
> > > This patch tries to eliminate those overhead by pin vq metadata pages
> > > and access them through vmap(). During SET_VRING_ADDR, we will setup
> > > those mappings and memory accessors are modified to use pointers to
> > > access the metadata directly.
> > > 
> > > Note, this was only done when device IOTLB is not enabled. We could
> > > use similar method to optimize it in the future.
> > > 
> > > Tests shows about ~24% improvement on TX PPS when using virtio-user +
> > > vhost_net + xdp1 on TAP (CONFIG_HARDENED_USERCOPY is not enabled):
> > > 
> > > Before: ~5.0Mpps
> > > After:  ~6.1Mpps
> > > 
> > > Signed-off-by: Jason Wang<jasowang@redhat.com>
> > > ---
> > >   drivers/vhost/vhost.c | 178 ++++++++++++++++++++++++++++++++++++++++++
> > >   drivers/vhost/vhost.h |  11 +++
> > >   2 files changed, 189 insertions(+)
> > > 
> > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > > index bafe39d2e637..1bd24203afb6 100644
> > > --- a/drivers/vhost/vhost.c
> > > +++ b/drivers/vhost/vhost.c
> > > @@ -443,6 +443,9 @@ void vhost_dev_init(struct vhost_dev *dev,
> > >   		vq->indirect = NULL;
> > >   		vq->heads = NULL;
> > >   		vq->dev = dev;
> > > +		memset(&vq->avail_ring, 0, sizeof(vq->avail_ring));
> > > +		memset(&vq->used_ring, 0, sizeof(vq->used_ring));
> > > +		memset(&vq->desc_ring, 0, sizeof(vq->desc_ring));
> > >   		mutex_init(&vq->mutex);
> > >   		vhost_vq_reset(dev, vq);
> > >   		if (vq->handle_kick)
> > > @@ -614,6 +617,102 @@ static void vhost_clear_msg(struct vhost_dev *dev)
> > >   	spin_unlock(&dev->iotlb_lock);
> > >   }
> > > +static int vhost_init_vmap(struct vhost_vmap *map, unsigned long uaddr,
> > > +			   size_t size, int write)
> > > +{
> > > +	struct page **pages;
> > > +	int npages = DIV_ROUND_UP(size, PAGE_SIZE);
> > > +	int npinned;
> > > +	void *vaddr;
> > > +
> > > +	pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
> > > +	if (!pages)
> > > +		return -ENOMEM;
> > > +
> > > +	npinned = get_user_pages_fast(uaddr, npages, write, pages);
> > > +	if (npinned != npages)
> > > +		goto err;
> > > +
> > As I said I have doubts about the whole approach, but this
> > implementation in particular isn't a good idea
> > as it keeps the page around forever.
> > So no THP, no NUMA rebalancing,
> 
> 
> This is the price of all GUP users not only vhost itself.

Yes. GUP is just not a great interface for vhost to use.

> What's more
> important, the goal is not to be left too much behind for other backends
> like DPDK or AF_XDP (all of which are using GUP).


So these guys assume userspace knows what it's doing.
We can't assume that.

> 
> > userspace-controlled
> > amount of memory locked up and not accounted for.
> 
> 
> It's pretty easy to add this since the slow path was still kept. If we
> exceeds the limitation, we can switch back to slow path.
> 
> > 
> > Don't get me wrong it's a great patch in an ideal world.
> > But then in an ideal world no barriers smap etc are necessary at all.
> 
> 
> Again, this is only for metadata accessing not the data which has been used
> for years for real use cases.
> 
> For SMAP, it makes sense for the address that kernel can not forecast. But
> it's not the case for the vhost metadata since we know the address will be
> accessed very frequently. For speculation barrier, it helps nothing for the
> data path of vhost which is a kthread.

I don't see how a kthread makes any difference. We do have a validation
step which makes some difference.

> Packet or AF_XDP benefit from
> accessing metadata directly, we should do it as well.
> 
> Thanks
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH net-next 0/3] vhost: accelerate metadata access through vmap()
From: Michael S. Tsirkin @ 2018-12-14 12:33 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <9459e227-a943-8553-732b-d7f5225a0f22@redhat.com>

On Fri, Dec 14, 2018 at 11:42:18AM +0800, Jason Wang wrote:
> 
> On 2018/12/13 下午11:27, Michael S. Tsirkin wrote:
> > On Thu, Dec 13, 2018 at 06:10:19PM +0800, Jason Wang wrote:
> > > Hi:
> > > 
> > > This series tries to access virtqueue metadata through kernel virtual
> > > address instead of copy_user() friends since they had too much
> > > overheads like checks, spec barriers or even hardware feature
> > > toggling.
> > Userspace accesses through remapping tricks and next time there's a need
> > for a new barrier we are left to figure it out by ourselves.
> 
> 
> I don't get here, do you mean spec barriers?

I mean the next barrier people decide to put into userspace
memory accesses.

> It's completely unnecessary for
> vhost which is kernel thread.

It's defence in depth. Take a look at the commit that added them.
And yes quite possibly in most cases we actually have a spec
barrier in the validation phase. If we do let's use the
unsafe variants so they can be found.

> And even if you're right, vhost is not the
> only place, there's lots of vmap() based accessing in kernel.

For sure. But if one can get by without get user pages, one
really should. Witness recently uncovered mess with file
backed storage.

> Think in
> another direction, this means we won't suffer from unnecessary barriers for
> kthread like vhost in the future, we will manually pick the one we really
> need

I personally think we should err on the side of caution not on the side of
performance.

> (but it should have little possibility).

History seems to teach otherwise.

> Please notice we only access metadata through remapping not the data itself.
> This idea has been used for high speed userspace backend for years, e.g
> packet socket or recent AF_XDP.

I think their justification for the higher risk is that they are mostly
designed for privileged userspace.

> The only difference is the page was remap to
> from kernel to userspace.

At least that avoids the g.u.p mess.

> 
> >    I don't
> > like the idea I have to say.  As a first step, why don't we switch to
> > unsafe_put_user/unsafe_get_user etc?
> 
> 
> Several reasons:
> 
> - They only have x86 variant, it won't have any difference for the rest of
> architecture.

Is there an issue on other architectures? If yes they can be extended
there.

> - unsafe_put_user/unsafe_get_user is not sufficient for accessing structures
> (e.g accessing descriptor) or arrays (batching).

So you want unsafe_copy_xxx_user? I can do this. Hang on will post.

> - Unless we can batch at least the accessing of two places in three of
> avail, used and descriptor in one run. There will be no difference. E.g we
> can batch updating used ring, but it won't make any difference in this case.
> 

So let's batch them all?


> > That would be more of an apples to apples comparison, would it not?
> 
> 
> Apples to apples comparison only help if we are the No.1. But the fact is we
> are not. If we want to compete with e.g dpdk or AF_XDP, vmap() is the
> fastest method AFAIK.
> 
> 
> Thanks

We need to speed up the packet access itself too though.
You can't vmap all of guest memory.


> 
> > 
> > 
> > > Test shows about 24% improvement on TX PPS. It should benefit other
> > > cases as well.
> > > 
> > > Please review
> > > 
> > > Jason Wang (3):
> > >    vhost: generalize adding used elem
> > >    vhost: fine grain userspace memory accessors
> > >    vhost: access vq metadata through kernel virtual address
> > > 
> > >   drivers/vhost/vhost.c | 281 ++++++++++++++++++++++++++++++++++++++----
> > >   drivers/vhost/vhost.h |  11 ++
> > >   2 files changed, 266 insertions(+), 26 deletions(-)
> > > 
> > > -- 
> > > 2.17.1
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH v2 0/5] VSOCK: support mergeable rx buffer in vhost-vsock
From: jiangyiwen @ 2018-12-14 10:24 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, kvm, Stefan Hajnoczi, virtualization
In-Reply-To: <20181212100835-mutt-send-email-mst@kernel.org>

On 2018/12/12 23:09, Michael S. Tsirkin wrote:
> On Wed, Dec 12, 2018 at 05:25:50PM +0800, jiangyiwen wrote:
>> Now vsock only support send/receive small packet, it can't achieve
>> high performance. As previous discussed with Jason Wang, I revisit the
>> idea of vhost-net about mergeable rx buffer and implement the mergeable
>> rx buffer in vhost-vsock, it can allow big packet to be scattered in
>> into different buffers and improve performance obviously.
>>
>> This series of patches mainly did three things：
>> - mergeable buffer implementation
>> - increase the max send pkt size
>> - add used and signal guest in a batch
>>
>> And I write a tool to test the vhost-vsock performance, mainly send big
>> packet(64K) included guest->Host and Host->Guest. I test performance
>> independently and the result as follows:
>>
>> Before performance:
>>               Single socket            Multiple sockets(Max Bandwidth)
>> Guest->Host   ~400MB/s                 ~480MB/s
>> Host->Guest   ~1450MB/s                ~1600MB/s
>>
>> After performance only use implement mergeable rx buffer:
>>               Single socket            Multiple sockets(Max Bandwidth)
>> Guest->Host   ~400MB/s                 ~480MB/s
>> Host->Guest   ~1280MB/s                ~1350MB/s
>>
>> In this case, max send pkt size is still limited to 4K, so Host->Guest
>> performance will worse than before.
> 
> It's concerning though, what if application sends small packets?
> What is the source of the slowdown? Do you know?
> 

Hi Michael,

To the two cases, I test the results included small and big packets as
follows:

64K packets performance comparison:
                                              Single socket    Multiple sockets
Host->Guest(before)                           1352.60MB/s      1436.33MB/s


Host->Guest(only use mergeable rx buffer)     1290.08MB/s      1212.67MB/s

4K packets performance comparison:
                                              Single socket    Multiple sockets
Host->Guest(before)                           535.47MB/s       688.67MB/s
Host->Guest(only use mergeable rx buffer)     522.33MB/s       599.00MB/s

3K packets performance comparison:
                                              Single socket    Multiple sockets
Host->Guest(before)                           359.74MB/s       442.00MB/s
Host->Guest(only use mergeable rx buffer)     374.47MB/s       452.33MB/s

Interestingly, for 64K and 4K packets, using the mergeable buffer
gives worse performance, while for 3K packets both have roughly
the same performance.

I guess that in mergeable mode, when the host sends a 4k packet to the
guest, it has to call vhost_get_vq_desc() twice (hdr + 4k data),
and the guest likewise has to call virtqueue_get_buf() twice.
When a packet is smaller than (4k - hdr), it fits in a
single page, so the performance is the same as before.

So in mergeable mode, the performance may be worse than before
for packet sizes in the range ((4k - hdr), 4k].

Thanks,
Yiwen.

>> After performance increase the max send pkt size to 64K:
>>               Single socket            Multiple sockets(Max Bandwidth)
>> Guest->Host   ~1700MB/s                ~2900MB/s
>> Host->Guest   ~1500MB/s                ~2440MB/s
>>
>> After performance all patches are used:
>>               Single socket            Multiple sockets(Max Bandwidth)
>> Guest->Host   ~1700MB/s                ~2900MB/s
>> Host->Guest   ~1700MB/s                ~2900MB/s
>>
>> From the test results, the performance is improved obviously, and guest
>> memory will not be wasted.
>>
>> In addition, in order to support mergeable rx buffer in virtio-vsock,
>> we need to add a qemu patch to support parse feature.
>>
>> ---
>> v1 -> v2:
>>  * Addressed comments from Jason Wang.
>>  * Add performance test result independently.
>>  * Use Skb_page_frag_refill() which can use high order page and reduce
>>    the stress of page allocator.
>>  * Still use fixed size(PAGE_SIZE) to fill rx buffer, because too small
>>    size can't fill one full packet, we only 128 vq num now.
>>  * Use iovec to replace buf in struct virtio_vsock_pkt, keep tx and rx
>>    consistency.
>>  * Add virtio_transport ops to get max pkt len, in order to be compatible
>>    with old version.
>> ---
>>
>> Yiwen Jiang (5):
>>   VSOCK: support fill mergeable rx buffer in guest
>>   VSOCK: support fill data to mergeable rx buffer in host
>>   VSOCK: support receive mergeable rx buffer in guest
>>   VSOCK: increase send pkt len in mergeable mode to improve performance
>>   VSOCK: batch sending rx buffer to increase bandwidth
>>
>>  drivers/vhost/vsock.c                   | 183 ++++++++++++++++++++-----
>>  include/linux/virtio_vsock.h            |  13 +-
>>  include/uapi/linux/virtio_vsock.h       |   5 +
>>  net/vmw_vsock/virtio_transport.c        | 229 +++++++++++++++++++++++++++-----
>>  net/vmw_vsock/virtio_transport_common.c |  66 ++++++---
>>  5 files changed, 411 insertions(+), 85 deletions(-)
>>
>> -- 
>> 1.8.3.1
> 
> .
> 


_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH v2 0/5] VSOCK: support mergeable rx buffer in vhost-vsock
From: jiangyiwen @ 2018-12-14  9:39 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: netdev, virtualization, kvm, Michael S. Tsirkin
In-Reply-To: <20181213163409.GP23318@stefanha-x1.localdomain>

On 2018/12/14 0:34, Stefan Hajnoczi wrote:
> On Wed, Dec 12, 2018 at 05:25:50PM +0800, jiangyiwen wrote:
>> Now vsock only support send/receive small packet, it can't achieve
>> high performance. As previous discussed with Jason Wang, I revisit the
>> idea of vhost-net about mergeable rx buffer and implement the mergeable
>> rx buffer in vhost-vsock, it can allow big packet to be scattered in
>> into different buffers and improve performance obviously.
> 
> Sorry, I've been a bad maintainer.  I was focussed on other projects and
> my email backlog is huge.
> 
> I like the idea of trying out optimizations on virtio-vsock, seeing if
> code can be shared with virtio-net, and maybe later switching to a
> virtio-net transport for vsock (if it turns out enough code can be
> shared).
> 
> Another optimization that could be interesting:
> 
> Userspace processes reading from a socket sleep in
> vsock_stream_recvmsg().  I wonder if we can bypass struct
> virtio_vsock_pkt and copying the payload into pkt->buf in this case.
> (This doesn't improve poll(2)/select(2) though!)
> 
> Imagine a userspace process waiting for data on a socket.  When the
> virtqueue becomes ready, we can read in struct virtio_vsock_hdr and find
> the socket for that connection.  Then we could copy the payload directly
> to userspace instead of creating a virtio_vsock_pkt and copying to
> pkt->buf first.
> 

Great, I will also consider this optimization later.
I will send the next version based on your suggestions.

Thanks,
Yiwen.

^ permalink raw reply

* Re: [PATCH v2 3/5] VSOCK: support receive mergeable rx buffer in guest
From: jiangyiwen @ 2018-12-14  8:41 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: netdev, virtualization, kvm, Michael S. Tsirkin
In-Reply-To: <20181213162039.GO23318@stefanha-x1.localdomain>

On 2018/12/14 0:20, Stefan Hajnoczi wrote:
> On Wed, Dec 12, 2018 at 05:31:39PM +0800, jiangyiwen wrote:
>> +static struct virtio_vsock_pkt *receive_mergeable(struct virtqueue *vq,
>> +		struct virtio_vsock *vsock, unsigned int *total_len)
>> +{
>> +	struct virtio_vsock_pkt *pkt;
>> +	u16 num_buf;
>> +	void *buf;
>> +	unsigned int len;
>> +	size_t vsock_hlen = sizeof(struct virtio_vsock_pkt);
>> +
>> +	buf = virtqueue_get_buf(vq, &len);
>> +	if (!buf)
>> +		return NULL;
>> +
>> +	*total_len = len;
>> +	vsock->rx_buf_nr--;
>> +
>> +	if (unlikely(len < vsock_hlen)) {
>> +		put_page(virt_to_head_page(buf));
>> +		return NULL;
>> +	}
>> +
>> +	pkt = buf;
>> +	num_buf = le16_to_cpu(pkt->mrg_rxbuf_hdr.num_buffers);
>> +	if (!num_buf || num_buf > VIRTIO_VSOCK_MAX_VEC_NUM) {
>> +		put_page(virt_to_head_page(buf));
>> +		return NULL;
>> +	}
>> +
>> +	/* Initialize pkt residual structure */
>> +	memset(&pkt->work, 0, vsock_hlen - sizeof(struct virtio_vsock_hdr) -
>> +			sizeof(struct virtio_vsock_mrg_rxbuf_hdr));
> 
> struct virtio_vsock_pkt is an internal driver state structure.  It must
> be able to change without breaking the host<->guest device interface.
> Exposing struct virtio_vsock_pkt across the device interface makes this
> impossible.
> 
> One way to solve this is by placing a header length field at the
> beginning of the mergeable buffer.  That way the device knows at which
> offset the payload should be placed and the driver can continue to use
> the allocation scheme you've chosen (where a single page contains the
> virtio_vsock_pkt and payload).
> 
> Another issue is that virtio requires exclusive read OR write
> descriptors.  You cannot have a descriptor that is both read and write.
> So there's not a huge advantage to combining the driver state struct and
> payload, except maybe the driver can save a memory allocation.
> 

Right, I will separate pkt and payload in different pages, as you mentioned
in another patch.

Thanks,
Yiwen.

^ permalink raw reply

* Re: [PATCH v2 3/5] VSOCK: support receive mergeable rx buffer in guest
From: jiangyiwen @ 2018-12-14  8:11 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, kvm, Stefan Hajnoczi, virtualization
In-Reply-To: <20181213092619-mutt-send-email-mst@kernel.org>

On 2018/12/13 22:29, Michael S. Tsirkin wrote:
> On Thu, Dec 13, 2018 at 10:38:09AM +0800, jiangyiwen wrote:
>> Hi Michael,
>>
>> On 2018/12/12 23:31, Michael S. Tsirkin wrote:
>>> On Wed, Dec 12, 2018 at 05:31:39PM +0800, jiangyiwen wrote:
>>>> Guest receive mergeable rx buffer, it can merge
>>>> scatter rx buffer into a big buffer and then copy
>>>> to user space.
>>>>
>>>> In addition, it also use iovec to replace buf in struct
>>>> virtio_vsock_pkt, keep tx and rx consistency. The only
>>>> difference is now tx still uses a segment of continuous
>>>> physical memory to implement.
>>>>
>>>> Signed-off-by: Yiwen Jiang <jiangyiwen@huawei.com>
>>>> ---
>>>>  drivers/vhost/vsock.c                   |  31 +++++++---
>>>>  include/linux/virtio_vsock.h            |   6 +-
>>>>  net/vmw_vsock/virtio_transport.c        | 105 ++++++++++++++++++++++++++++----
>>>>  net/vmw_vsock/virtio_transport_common.c |  59 ++++++++++++++----
>>>>  4 files changed, 166 insertions(+), 35 deletions(-)
>>>
>>>
>>> This was supposed to be a guest patch, why is vhost changed here?
>>>
>>
>> In mergeable rx buff cases, it need to scatter big packets into several
>> buffers, so I add kvec variable in struct virtio_vsock_pkt, at the same
>> time, in order to keep tx and rx consistency, I use kvec to replace
>> variable buf, because vhost use the variable pkt->buf, so this patch
>> caused vhost is changed.
> 
> You'd want to split these patches imho.
> 

Ok, I should do it.

>>>> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
>>>> index dc52b0f..c7ab0dd 100644
>>>> --- a/drivers/vhost/vsock.c
>>>> +++ b/drivers/vhost/vsock.c
>>>> @@ -179,6 +179,8 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
>>>>  		size_t nbytes;
>>>>  		size_t len;
>>>>  		s16 headcount;
>>>> +		size_t remain_len;
>>>> +		int i;
>>>>
>>>>  		spin_lock_bh(&vsock->send_pkt_list_lock);
>>>>  		if (list_empty(&vsock->send_pkt_list)) {
>>>> @@ -221,11 +223,19 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
>>>>  			break;
>>>>  		}
>>>>
>>>> -		nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter);
>>>> -		if (nbytes != pkt->len) {
>>>> -			virtio_transport_free_pkt(pkt);
>>>> -			vq_err(vq, "Faulted on copying pkt buf\n");
>>>> -			break;
>>>> +		remain_len = pkt->len;
>>>> +		for (i = 0; i < pkt->nr_vecs; i++) {
>>>> +			int tmp_len;
>>>> +
>>>> +			tmp_len = min(remain_len, pkt->vec[i].iov_len);
>>>> +			nbytes = copy_to_iter(pkt->vec[i].iov_base, tmp_len, &iov_iter);
>>>> +			if (nbytes != tmp_len) {
>>>> +				virtio_transport_free_pkt(pkt);
>>>> +				vq_err(vq, "Faulted on copying pkt buf\n");
>>>> +				break;
>>>> +			}
>>>> +
>>>> +			remain_len -= tmp_len;
>>>>  		}
>>>>
>>>>  		vhost_add_used_n(vq, vq->heads, headcount);
>>>> @@ -341,6 +351,7 @@ static void vhost_transport_send_pkt_work(struct vhost_work *work)
>>>>  	struct iov_iter iov_iter;
>>>>  	size_t nbytes;
>>>>  	size_t len;
>>>> +	void *buf;
>>>>
>>>>  	if (in != 0) {
>>>>  		vq_err(vq, "Expected 0 input buffers, got %u\n", in);
>>>> @@ -375,13 +386,17 @@ static void vhost_transport_send_pkt_work(struct vhost_work *work)
>>>>  		return NULL;
>>>>  	}
>>>>
>>>> -	pkt->buf = kmalloc(pkt->len, GFP_KERNEL);
>>>> -	if (!pkt->buf) {
>>>> +	buf = kmalloc(pkt->len, GFP_KERNEL);
>>>> +	if (!buf) {
>>>>  		kfree(pkt);
>>>>  		return NULL;
>>>>  	}
>>>>
>>>> -	nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter);
>>>> +	pkt->vec[0].iov_base = buf;
>>>> +	pkt->vec[0].iov_len = pkt->len;
>>>> +	pkt->nr_vecs = 1;
>>>> +
>>>> +	nbytes = copy_from_iter(buf, pkt->len, &iov_iter);
>>>>  	if (nbytes != pkt->len) {
>>>>  		vq_err(vq, "Expected %u byte payload, got %zu bytes\n",
>>>>  		       pkt->len, nbytes);
>>>> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
>>>> index da9e1fe..734eeed 100644
>>>> --- a/include/linux/virtio_vsock.h
>>>> +++ b/include/linux/virtio_vsock.h
>>>> @@ -13,6 +13,8 @@
>>>>  #define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE	(1024 * 4)
>>>>  #define VIRTIO_VSOCK_MAX_BUF_SIZE		0xFFFFFFFFUL
>>>>  #define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE		(1024 * 64)
>>>> +/* virtio_vsock_pkt + max_pkt_len(default MAX_PKT_BUF_SIZE) */
>>>> +#define VIRTIO_VSOCK_MAX_VEC_NUM ((VIRTIO_VSOCK_MAX_PKT_BUF_SIZE / PAGE_SIZE) + 1)
>>>>
>>>>  /* Virtio-vsock feature */
>>>>  #define VIRTIO_VSOCK_F_MRG_RXBUF 0 /* Host can merge receive buffers. */
>>>> @@ -55,10 +57,12 @@ struct virtio_vsock_pkt {
>>>>  	struct list_head list;
>>>>  	/* socket refcnt not held, only use for cancellation */
>>>>  	struct vsock_sock *vsk;
>>>> -	void *buf;
>>>> +	struct kvec vec[VIRTIO_VSOCK_MAX_VEC_NUM];
>>>> +	int nr_vecs;
>>>>  	u32 len;
>>>>  	u32 off;
>>>>  	bool reply;
>>>> +	bool mergeable;
>>>>  };
>>>>
>>>>  struct virtio_vsock_pkt_info {
>>>> diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
>>>> index c4a465c..148b58a 100644
>>>> --- a/net/vmw_vsock/virtio_transport.c
>>>> +++ b/net/vmw_vsock/virtio_transport.c
>>>> @@ -155,8 +155,10 @@ static int virtio_transport_send_pkt_loopback(struct virtio_vsock *vsock,
>>>>
>>>>  		sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
>>>>  		sgs[out_sg++] = &hdr;
>>>> -		if (pkt->buf) {
>>>> -			sg_init_one(&buf, pkt->buf, pkt->len);
>>>> +		if (pkt->len) {
>>>> +			/* Currently only support a segment of memory in tx */
>>>> +			BUG_ON(pkt->vec[0].iov_len != pkt->len);
>>>> +			sg_init_one(&buf, pkt->vec[0].iov_base, pkt->vec[0].iov_len);
>>>>  			sgs[out_sg++] = &buf;
>>>>  		}
>>>>
>>>> @@ -304,23 +306,28 @@ static int fill_old_rx_buff(struct virtqueue *vq)
>>>>  	struct virtio_vsock_pkt *pkt;
>>>>  	struct scatterlist hdr, buf, *sgs[2];
>>>>  	int ret;
>>>> +	void *pkt_buf;
>>>>
>>>>  	pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
>>>>  	if (!pkt)
>>>>  		return -ENOMEM;
>>>>
>>>> -	pkt->buf = kmalloc(buf_len, GFP_KERNEL);
>>>> -	if (!pkt->buf) {
>>>> +	pkt_buf = kmalloc(buf_len, GFP_KERNEL);
>>>> +	if (!pkt_buf) {
>>>>  		virtio_transport_free_pkt(pkt);
>>>>  		return -ENOMEM;
>>>>  	}
>>>>
>>>> +	pkt->vec[0].iov_base = pkt_buf;
>>>> +	pkt->vec[0].iov_len = buf_len;
>>>> +	pkt->nr_vecs = 1;
>>>> +
>>>>  	pkt->len = buf_len;
>>>>
>>>>  	sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
>>>>  	sgs[0] = &hdr;
>>>>
>>>> -	sg_init_one(&buf, pkt->buf, buf_len);
>>>> +	sg_init_one(&buf, pkt->vec[0].iov_base, buf_len);
>>>>  	sgs[1] = &buf;
>>>>  	ret = virtqueue_add_sgs(vq, sgs, 0, 2, pkt, GFP_KERNEL);
>>>>  	if (ret)
>>>> @@ -388,11 +395,78 @@ static bool virtio_transport_more_replies(struct virtio_vsock *vsock)
>>>>  	return val < virtqueue_get_vring_size(vq);
>>>>  }
>>>>
>>>> +static struct virtio_vsock_pkt *receive_mergeable(struct virtqueue *vq,
>>>> +		struct virtio_vsock *vsock, unsigned int *total_len)
>>>> +{
>>>> +	struct virtio_vsock_pkt *pkt;
>>>> +	u16 num_buf;
>>>> +	void *buf;
>>>> +	unsigned int len;
>>>> +	size_t vsock_hlen = sizeof(struct virtio_vsock_pkt);
>>>> +
>>>> +	buf = virtqueue_get_buf(vq, &len);
>>>> +	if (!buf)
>>>> +		return NULL;
>>>> +
>>>> +	*total_len = len;
>>>> +	vsock->rx_buf_nr--;
>>>> +
>>>> +	if (unlikely(len < vsock_hlen)) {
>>>> +		put_page(virt_to_head_page(buf));
>>>> +		return NULL;
>>>> +	}
>>>> +
>>>> +	pkt = buf;
>>>> +	num_buf = le16_to_cpu(pkt->mrg_rxbuf_hdr.num_buffers);
>>>> +	if (!num_buf || num_buf > VIRTIO_VSOCK_MAX_VEC_NUM) {
>>>> +		put_page(virt_to_head_page(buf));
>>>> +		return NULL;
>>>> +	}
>>>
>>> So everything just stops going, and host and user don't even
>>> know what the reason is. And not only that - the next
>>> packet will be corrupted because we skipped the first one.
>>>
>>>
>>
>> I understand this case will not encountered unless the code has
>> *BUG*, like Host send some problematic packages (shorten/longer than
>> expected). In this case, I think we should ignore/drop these packets.
> 
> If there's a specific packet length expected, e.g.  in this case vector
> size, it needs to be negotiated between host and guest in some way.
> 

Actually, I still keep that length check in virtio_transport_rx_work():
receive_mergeable() calculates the total length, which is then compared
with pkt->hdr.len (filled in by the host).

>>>
>>>> +
>>>> +	/* Initialize pkt residual structure */
>>>> +	memset(&pkt->work, 0, vsock_hlen - sizeof(struct virtio_vsock_hdr) -
>>>> +			sizeof(struct virtio_vsock_mrg_rxbuf_hdr));
>>>> +
>>>> +	pkt->mergeable = true;
>>>> +	pkt->len = le32_to_cpu(pkt->hdr.len);
>>>> +	if (!pkt->len)
>>>> +		return pkt;
>>>> +
>>>> +	len -= vsock_hlen;
>>>> +	if (len) {
>>>> +		pkt->vec[pkt->nr_vecs].iov_base = buf + vsock_hlen;
>>>> +		pkt->vec[pkt->nr_vecs].iov_len = len;
>>>> +		/* Shared page with pkt, so get page in advance */
>>>> +		get_page(virt_to_head_page(buf));
>>>> +		pkt->nr_vecs++;
>>>> +	}
>>>> +
>>>> +	while (--num_buf) {
>>>> +		buf = virtqueue_get_buf(vq, &len);
>>>> +		if (!buf)
>>>> +			goto err;
>>>> +
>>>> +		*total_len += len;
>>>> +		vsock->rx_buf_nr--;
>>>> +
>>>> +		pkt->vec[pkt->nr_vecs].iov_base = buf;
>>>> +		pkt->vec[pkt->nr_vecs].iov_len = len;
>>>> +		pkt->nr_vecs++;
>>>> +	}
>>>> +
>>>> +	return pkt;
>>>> +err:
>>>> +	virtio_transport_free_pkt(pkt);
>>>> +	return NULL;
>>>> +}
>>>> +
>>>>  static void virtio_transport_rx_work(struct work_struct *work)
>>>>  {
>>>>  	struct virtio_vsock *vsock =
>>>>  		container_of(work, struct virtio_vsock, rx_work);
>>>>  	struct virtqueue *vq;
>>>> +	size_t vsock_hlen = vsock->mergeable ? sizeof(struct virtio_vsock_pkt) :
>>>> +			sizeof(struct virtio_vsock_hdr);
>>>>
>>>>  	vq = vsock->vqs[VSOCK_VQ_RX];
>>>>
>>>> @@ -412,21 +486,26 @@ static void virtio_transport_rx_work(struct work_struct *work)
>>>>  				goto out;
>>>>  			}
>>>>
>>>> -			pkt = virtqueue_get_buf(vq, &len);
>>>> -			if (!pkt) {
>>>> -				break;
>>>> -			}
>>>> +			if (likely(vsock->mergeable)) {
>>>> +				pkt = receive_mergeable(vq, vsock, &len);
>>>> +				if (!pkt)
>>>> +					break;
>>>> +			} else {
>>>> +				pkt = virtqueue_get_buf(vq, &len);
>>>> +				if (!pkt)
>>>> +					break;
>>>>
>>>
>>> So looking at it, this seems to be the main source of the gain.
>>> But why does this require host/guest changes?
>>>
>>>
>>> The way I see it:
>>> 	- get a buffer and create an skb
>>> 	- get the next one, check header matches, if yes
>>> 	  tack it on the skb as a fragment. If not then
>>> 	  don't, deliver previous one and queue the new one.
>>>
>>>
>>
>> Vhost change reason I explain as above, and I hope use kvec
>> to instead buf, after all buf only can express a contiguous
>> physical memory.
>>
>> Thanks,
>> Yiwen.
> 
> 
> I got the reason but I am not yet convinced it's a good one.
> You don't necessarily need host to skip headers
> in all but the first chunk. Guest can do this just as well.
> 
> 

I understand what you mean. Right, it only wastes some memory, but it
is more convenient: I don't need to skip headers, and the
host doesn't need to transport the total pkt. I will fix it in a later
version.

Thanks,
Yiwen.

> 
>>>
>>>> -			vsock->rx_buf_nr--;
>>>> +				vsock->rx_buf_nr--;
>>>> +			}
>>>>
>>>>  			/* Drop short/long packets */
>>>> -			if (unlikely(len < sizeof(pkt->hdr) ||
>>>> -				     len > sizeof(pkt->hdr) + pkt->len)) {
>>>> +			if (unlikely(len < vsock_hlen ||
>>>> +				     len > vsock_hlen + pkt->len)) {
>>>>  				virtio_transport_free_pkt(pkt);
>>>>  				continue;
>>>>  			}
>>>>
>>>> -			pkt->len = len - sizeof(pkt->hdr);
>>>> +			pkt->len = len - vsock_hlen;
>>>>  			virtio_transport_deliver_tap_pkt(pkt);
>>>>  			virtio_transport_recv_pkt(pkt);
>>>>  		}
>>>> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
>>>> index 3ae3a33..123a8b6 100644
>>>> --- a/net/vmw_vsock/virtio_transport_common.c
>>>> +++ b/net/vmw_vsock/virtio_transport_common.c
>>>> @@ -44,6 +44,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
>>>>  {
>>>>  	struct virtio_vsock_pkt *pkt;
>>>>  	int err;
>>>> +	void *buf = NULL;
>>>>
>>>>  	pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
>>>>  	if (!pkt)
>>>> @@ -62,12 +63,16 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
>>>>  	pkt->vsk		= info->vsk;
>>>>
>>>>  	if (info->msg && len > 0) {
>>>> -		pkt->buf = kmalloc(len, GFP_KERNEL);
>>>> -		if (!pkt->buf)
>>>> +		buf = kmalloc(len, GFP_KERNEL);
>>>> +		if (!buf)
>>>>  			goto out_pkt;
>>>> -		err = memcpy_from_msg(pkt->buf, info->msg, len);
>>>> +		err = memcpy_from_msg(buf, info->msg, len);
>>>>  		if (err)
>>>>  			goto out;
>>>> +
>>>> +		pkt->vec[0].iov_base = buf;
>>>> +		pkt->vec[0].iov_len = len;
>>>> +		pkt->nr_vecs = 1;
>>>>  	}
>>>>
>>>>  	trace_virtio_transport_alloc_pkt(src_cid, src_port,
>>>> @@ -80,7 +85,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void)
>>>>  	return pkt;
>>>>
>>>>  out:
>>>> -	kfree(pkt->buf);
>>>> +	kfree(buf);
>>>>  out_pkt:
>>>>  	kfree(pkt);
>>>>  	return NULL;
>>>> @@ -92,6 +97,7 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
>>>>  	struct virtio_vsock_pkt *pkt = opaque;
>>>>  	struct af_vsockmon_hdr *hdr;
>>>>  	struct sk_buff *skb;
>>>> +	int i;
>>>>
>>>>  	skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + pkt->len,
>>>>  			GFP_ATOMIC);
>>>> @@ -134,7 +140,8 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
>>>>  	skb_put_data(skb, &pkt->hdr, sizeof(pkt->hdr));
>>>>
>>>>  	if (pkt->len) {
>>>> -		skb_put_data(skb, pkt->buf, pkt->len);
>>>> +		for (i = 0; i < pkt->nr_vecs; i++)
>>>> +			skb_put_data(skb, pkt->vec[i].iov_base, pkt->vec[i].iov_len);
>>>>  	}
>>>>
>>>>  	return skb;
>>>> @@ -260,6 +267,9 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk,
>>>>
>>>>  	spin_lock_bh(&vvs->rx_lock);
>>>>  	while (total < len && !list_empty(&vvs->rx_queue)) {
>>>> +		size_t copy_bytes, last_vec_total = 0, vec_off;
>>>> +		int i;
>>>> +
>>>>  		pkt = list_first_entry(&vvs->rx_queue,
>>>>  				       struct virtio_vsock_pkt, list);
>>>>
>>>> @@ -272,14 +282,28 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk,
>>>>  		 */
>>>>  		spin_unlock_bh(&vvs->rx_lock);
>>>>
>>>> -		err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes);
>>>> -		if (err)
>>>> -			goto out;
>>>> +		for (i = 0; i < pkt->nr_vecs; i++) {
>>>> +			if (pkt->off > last_vec_total + pkt->vec[i].iov_len) {
>>>> +				last_vec_total += pkt->vec[i].iov_len;
>>>> +				continue;
>>>> +			}
>>>> +
>>>> +			vec_off = pkt->off - last_vec_total;
>>>> +			copy_bytes = min(pkt->vec[i].iov_len - vec_off, bytes);
>>>> +			err = memcpy_to_msg(msg, pkt->vec[i].iov_base + vec_off,
>>>> +					copy_bytes);
>>>> +			if (err)
>>>> +				goto out;
>>>> +
>>>> +			bytes -= copy_bytes;
>>>> +			pkt->off += copy_bytes;
>>>> +			total += copy_bytes;
>>>> +			last_vec_total += pkt->vec[i].iov_len;
>>>> +			if (!bytes)
>>>> +				break;
>>>> +		}
>>>>
>>>>  		spin_lock_bh(&vvs->rx_lock);
>>>> -
>>>> -		total += bytes;
>>>> -		pkt->off += bytes;
>>>>  		if (pkt->off == pkt->len) {
>>>>  			virtio_transport_dec_rx_pkt(vvs, pkt);
>>>>  			list_del(&pkt->list);
>>>> @@ -1050,8 +1074,17 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt)
>>>>
>>>>  void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt)
>>>>  {
>>>> -	kfree(pkt->buf);
>>>> -	kfree(pkt);
>>>> +	int i;
>>>> +
>>>> +	if (pkt->mergeable) {
>>>> +		for (i = 0; i < pkt->nr_vecs; i++)
>>>> +			put_page(virt_to_head_page(pkt->vec[i].iov_base));
>>>> +		put_page(virt_to_head_page((void *)pkt));
>>>> +	} else {
>>>> +		for (i = 0; i < pkt->nr_vecs; i++)
>>>> +			kfree(pkt->vec[i].iov_base);
>>>> +		kfree(pkt);
>>>> +	}
>>>>  }
>>>>  EXPORT_SYMBOL_GPL(virtio_transport_free_pkt);
>>>>
>>>> -- 
>>>> 1.8.3.1
>>>>
>>>
>>> .
>>>
>>
> 
> .
> 

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox