Linux virtualization list
 help / color / mirror / Atom feed
* [PATCH v2 16/27] compiler: Option to add PROVIDE_HIDDEN replacement for weak symbols
From: Thomas Garnier via Virtualization @ 2018-03-13 20:59 UTC (permalink / raw)
  To: Herbert Xu, David S . Miller, Thomas Gleixner, Ingo Molnar,
	H . Peter Anvin, Peter Zijlstra, Josh Poimboeuf,
	Greg Kroah-Hartman, Kate Stewart, Thomas Garnier, Arnd Bergmann,
	Philippe Ombredanne, Arnaldo Carvalho de Melo, Andrey Ryabinin,
	Matthias Kaehlcke, Kees Cook, Tom Lendacky, Kirill A . Shutemov,
	Andy Lutomirski, Dominik Brodowski, Borislav Petkov,
	Borislav Petkov, Raf
  Cc: linux-arch, kvm, linux-pm, x86, linux-doc, linux-kernel,
	virtualization, linux-sparse, linux-crypto, kernel-hardening,
	xen-devel
In-Reply-To: <20180313205945.245105-1-thgarnie@google.com>

Provide an option to have a PROVIDE_HIDDEN (linker script) entry for
each weak symbol. This option solve an error in x86_64 where the linker
optimizes pie generate code to be non-pie because --emit-relocs was used
instead of -pie (to reduce dynamic relocations).

Signed-off-by: Thomas Garnier <thgarnie@google.com>
---
 init/Kconfig            |  7 +++++++
 scripts/link-vmlinux.sh | 14 ++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/init/Kconfig b/init/Kconfig
index c924babc6d47..fe9f9ada4db0 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1927,6 +1927,13 @@ config ASN1
 	  inform it as to what tags are to be expected in a stream and what
 	  functions to call on what tags.
 
+config WEAK_PROVIDE_HIDDEN
+	bool
+	help
+	  Generate linker script PROVIDE_HIDDEN entries for all weak symbols. It
+	  allows to prevent non-pie code being replaced by the linker if the
+	  emit-relocs option is used instead of pie (useful for x86_64 pie).
+
 source "kernel/Kconfig.locks"
 
 config ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh
index 08ca08e9105c..c015f5142ecf 100755
--- a/scripts/link-vmlinux.sh
+++ b/scripts/link-vmlinux.sh
@@ -146,6 +146,17 @@ kallsyms()
 	${CC} ${aflags} -c -o ${2} ${afile}
 }
 
+gen_weak_provide_hidden()
+{
+        if [ -n "${CONFIG_WEAK_PROVIDE_HIDDEN}" ]; then
+                local pattern="s/^\s\+ w \(\w\+\)$/PROVIDE_HIDDEN(\1 = .);/gp"
+                echo -e "SECTIONS {\n. = _end;" > .tmp_vmlinux_hiddenld
+                ${NM} ${1} | sed -n "${pattern}" >> .tmp_vmlinux_hiddenld
+                echo "}" >> .tmp_vmlinux_hiddenld
+                LDFLAGS_vmlinux="${LDFLAGS_vmlinux} -T .tmp_vmlinux_hiddenld"
+        fi
+}
+
 # Create map file with all symbols from ${1}
 # See mksymap for additional details
 mksysmap()
@@ -230,6 +241,9 @@ modpost_link vmlinux.o
 # modpost vmlinux.o to check for section mismatches
 ${MAKE} -f "${srctree}/scripts/Makefile.modpost" vmlinux.o
 
+# Generate weak linker script
+gen_weak_provide_hidden vmlinux.o
+
 kallsymso=""
 kallsyms_vmlinux=""
 if [ -n "${CONFIG_KALLSYMS}" ]; then
-- 
2.16.2.660.g709887971b-goog

^ permalink raw reply related

* [PATCH v2 17/27] x86/relocs: Handle PIE relocations
From: Thomas Garnier via Virtualization @ 2018-03-13 20:59 UTC (permalink / raw)
  To: Herbert Xu, David S . Miller, Thomas Gleixner, Ingo Molnar,
	H . Peter Anvin, Peter Zijlstra, Josh Poimboeuf,
	Greg Kroah-Hartman, Kate Stewart, Thomas Garnier, Arnd Bergmann,
	Philippe Ombredanne, Arnaldo Carvalho de Melo, Andrey Ryabinin,
	Matthias Kaehlcke, Kees Cook, Tom Lendacky, Kirill A . Shutemov,
	Andy Lutomirski, Dominik Brodowski, Borislav Petkov,
	Borislav Petkov, Raf
  Cc: linux-arch, kvm, linux-pm, x86, linux-doc, linux-kernel,
	virtualization, linux-sparse, linux-crypto, kernel-hardening,
	xen-devel
In-Reply-To: <20180313205945.245105-1-thgarnie@google.com>

Change the relocation tool to correctly handle relocations generated by
-fPIE option:

 - Add relocation for each entry of the .got section given the linker does not
   generate R_X86_64_GLOB_DAT on a simple link.
 - Ignore R_X86_64_GOTPCREL.

Signed-off-by: Thomas Garnier <thgarnie@google.com>
---
 arch/x86/tools/relocs.c | 93 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 92 insertions(+), 1 deletion(-)

diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index 220e97841e49..a35cc337f883 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -32,6 +32,7 @@ struct section {
 	Elf_Sym        *symtab;
 	Elf_Rel        *reltab;
 	char           *strtab;
+	Elf_Addr       *got;
 };
 static struct section *secs;
 
@@ -293,6 +294,35 @@ static Elf_Sym *sym_lookup(const char *symname)
 	return 0;
 }
 
+static Elf_Sym *sym_lookup_addr(Elf_Addr addr, const char **name)
+{
+	int i;
+	for (i = 0; i < ehdr.e_shnum; i++) {
+		struct section *sec = &secs[i];
+		long nsyms;
+		Elf_Sym *symtab;
+		Elf_Sym *sym;
+
+		if (sec->shdr.sh_type != SHT_SYMTAB)
+			continue;
+
+		nsyms = sec->shdr.sh_size/sizeof(Elf_Sym);
+		symtab = sec->symtab;
+
+		for (sym = symtab; --nsyms >= 0; sym++) {
+			if (sym->st_value == addr) {
+				if (name) {
+					*name = sym_name(sec->link->strtab,
+							 sym);
+				}
+				return sym;
+			}
+		}
+	}
+	return 0;
+}
+
+
 #if BYTE_ORDER == LITTLE_ENDIAN
 #define le16_to_cpu(val) (val)
 #define le32_to_cpu(val) (val)
@@ -513,6 +543,33 @@ static void read_relocs(FILE *fp)
 	}
 }
 
+static void read_got(FILE *fp)
+{
+	int i;
+	for (i = 0; i < ehdr.e_shnum; i++) {
+		struct section *sec = &secs[i];
+		sec->got = NULL;
+		if (sec->shdr.sh_type != SHT_PROGBITS ||
+		    strcmp(sec_name(i), ".got")) {
+			continue;
+		}
+		sec->got = malloc(sec->shdr.sh_size);
+		if (!sec->got) {
+			die("malloc of %d bytes for got failed\n",
+				sec->shdr.sh_size);
+		}
+		if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) {
+			die("Seek to %d failed: %s\n",
+				sec->shdr.sh_offset, strerror(errno));
+		}
+		if (fread(sec->got, 1, sec->shdr.sh_size, fp)
+		    != sec->shdr.sh_size) {
+			die("Cannot read got: %s\n",
+				strerror(errno));
+		}
+	}
+}
+
 
 static void print_absolute_symbols(void)
 {
@@ -643,6 +700,32 @@ static void add_reloc(struct relocs *r, uint32_t offset)
 	r->offset[r->count++] = offset;
 }
 
+/*
+ * The linker does not generate relocations for the GOT for the kernel.
+ * If a GOT is found, simulate the relocations that should have been included.
+ */
+static void walk_got_table(int (*process)(struct section *sec, Elf_Rel *rel,
+					  Elf_Sym *sym, const char *symname),
+			   struct section *sec)
+{
+	int i;
+	Elf_Addr entry;
+	Elf_Sym *sym;
+	const char *symname;
+	Elf_Rel rel;
+
+	for (i = 0; i < sec->shdr.sh_size/sizeof(Elf_Addr); i++) {
+		entry = sec->got[i];
+		sym = sym_lookup_addr(entry, &symname);
+		if (!sym)
+			die("Could not found got symbol for entry %d\n", i);
+		rel.r_offset = sec->shdr.sh_addr + i * sizeof(Elf_Addr);
+		rel.r_info = ELF_BITS == 64 ? R_X86_64_GLOB_DAT
+			     : R_386_GLOB_DAT;
+		process(sec, &rel, sym, symname);
+	}
+}
+
 static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel,
 			Elf_Sym *sym, const char *symname))
 {
@@ -656,6 +739,8 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel,
 		struct section *sec = &secs[i];
 
 		if (sec->shdr.sh_type != SHT_REL_TYPE) {
+			if (sec->got)
+				walk_got_table(process, sec);
 			continue;
 		}
 		sec_symtab  = sec->link;
@@ -765,6 +850,7 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym,
 		offset += per_cpu_load_addr;
 
 	switch (r_type) {
+	case R_X86_64_GOTPCREL:
 	case R_X86_64_NONE:
 		/* NONE can be ignored. */
 		break;
@@ -809,7 +895,7 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym,
 		 * the relocations are processed.
 		 * Make sure that the offset will fit.
 		 */
-		if ((int32_t)offset != (int64_t)offset)
+		if (r_type != R_X86_64_64 && (int32_t)offset != (int64_t)offset)
 			die("Relocation offset doesn't fit in 32 bits\n");
 
 		if (r_type == R_X86_64_64)
@@ -818,6 +904,10 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym,
 			add_reloc(&relocs32, offset);
 		break;
 
+	case R_X86_64_GLOB_DAT:
+		add_reloc(&relocs64, offset);
+		break;
+
 	default:
 		die("Unsupported relocation type: %s (%d)\n",
 		    rel_type(r_type), r_type);
@@ -1087,6 +1177,7 @@ void process(FILE *fp, int use_real_mode, int as_text,
 	read_strtabs(fp);
 	read_symtabs(fp);
 	read_relocs(fp);
+	read_got(fp);
 	if (ELF_BITS == 64)
 		percpu_init();
 	if (show_absolute_syms) {
-- 
2.16.2.660.g709887971b-goog

^ permalink raw reply related

* [PATCH v2 18/27] xen: Adapt assembly for PIE support
From: Thomas Garnier via Virtualization @ 2018-03-13 20:59 UTC (permalink / raw)
  To: Herbert Xu, David S . Miller, Thomas Gleixner, Ingo Molnar,
	H . Peter Anvin, Peter Zijlstra, Josh Poimboeuf,
	Greg Kroah-Hartman, Kate Stewart, Thomas Garnier, Arnd Bergmann,
	Philippe Ombredanne, Arnaldo Carvalho de Melo, Andrey Ryabinin,
	Matthias Kaehlcke, Kees Cook, Tom Lendacky, Kirill A . Shutemov,
	Andy Lutomirski, Dominik Brodowski, Borislav Petkov,
	Borislav Petkov, Raf
  Cc: linux-arch, kvm, linux-pm, x86, linux-doc, linux-kernel,
	virtualization, linux-sparse, linux-crypto, kernel-hardening,
	xen-devel
In-Reply-To: <20180313205945.245105-1-thgarnie@google.com>

Change the assembly code to use the new _ASM_MOVABS macro which get a
symbol reference while being PIE compatible. Adapt the relocation tool
to ignore 32-bit Xen code.

Position Independent Executable (PIE) support will allow to extended the
KASLR randomization range below the -2G memory limit.

Signed-off-by: Thomas Garnier <thgarnie@google.com>
---
 arch/x86/tools/relocs.c | 16 +++++++++++++++-
 arch/x86/xen/xen-head.S | 11 ++++++-----
 arch/x86/xen/xen-pvh.S  | 13 +++++++++----
 3 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index a35cc337f883..29283ad3950f 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -832,6 +832,16 @@ static int is_percpu_sym(ElfW(Sym) *sym, const char *symname)
 		strncmp(symname, "init_per_cpu_", 13);
 }
 
+/*
+ * Check if the 32-bit relocation is within the xenpvh 32-bit code.
+ * If so, ignores it.
+ */
+static int is_in_xenpvh_assembly(ElfW(Addr) offset)
+{
+	ElfW(Sym) *sym = sym_lookup("pvh_start_xen");
+	return sym && (offset >= sym->st_value) &&
+		(offset < (sym->st_value + sym->st_size));
+}
 
 static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym,
 		      const char *symname)
@@ -895,8 +905,12 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym,
 		 * the relocations are processed.
 		 * Make sure that the offset will fit.
 		 */
-		if (r_type != R_X86_64_64 && (int32_t)offset != (int64_t)offset)
+		if (r_type != R_X86_64_64 &&
+		    (int32_t)offset != (int64_t)offset) {
+			if (is_in_xenpvh_assembly(offset))
+				break;
 			die("Relocation offset doesn't fit in 32 bits\n");
+		}
 
 		if (r_type == R_X86_64_64)
 			add_reloc(&relocs64, offset);
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 96f26e026783..210568e63c84 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -28,14 +28,15 @@ ENTRY(startup_xen)
 
 	/* Clear .bss */
 	xor %eax,%eax
-	mov $__bss_start, %_ASM_DI
-	mov $__bss_stop, %_ASM_CX
+	_ASM_MOVABS $__bss_start, %_ASM_DI
+	_ASM_MOVABS $__bss_stop, %_ASM_CX
 	sub %_ASM_DI, %_ASM_CX
 	shr $__ASM_SEL(2, 3), %_ASM_CX
 	rep __ASM_SIZE(stos)
 
-	mov %_ASM_SI, xen_start_info
-	mov $init_thread_union+THREAD_SIZE, %_ASM_SP
+	_ASM_MOVABS $xen_start_info, %_ASM_AX
+	_ASM_MOV %_ASM_SI, (%_ASM_AX)
+	_ASM_MOVABS $init_thread_union+THREAD_SIZE, %_ASM_SP
 
 #ifdef CONFIG_X86_64
 	/* Set up %gs.
@@ -46,7 +47,7 @@ ENTRY(startup_xen)
 	 * init data section till per cpu areas are set up.
 	 */
 	movl	$MSR_GS_BASE,%ecx
-	movq	$INIT_PER_CPU_VAR(irq_stack_union),%rax
+	movabsq	$INIT_PER_CPU_VAR(irq_stack_union),%rax
 	cdq
 	wrmsr
 #endif
diff --git a/arch/x86/xen/xen-pvh.S b/arch/x86/xen/xen-pvh.S
index e1a5fbeae08d..43e234c7c2de 100644
--- a/arch/x86/xen/xen-pvh.S
+++ b/arch/x86/xen/xen-pvh.S
@@ -101,8 +101,8 @@ ENTRY(pvh_start_xen)
 	call xen_prepare_pvh
 
 	/* startup_64 expects boot_params in %rsi. */
-	mov $_pa(pvh_bootparams), %rsi
-	mov $_pa(startup_64), %rax
+	movabs $_pa(pvh_bootparams), %rsi
+	movabs $_pa(startup_64), %rax
 	jmp *%rax
 
 #else /* CONFIG_X86_64 */
@@ -137,10 +137,15 @@ END(pvh_start_xen)
 
 	.section ".init.data","aw"
 	.balign 8
+	/*
+	 * Use a quad for _pa(gdt_start) because PIE does not understand a
+	 * long is enough. The resulting value will still be in the lower long
+	 * part.
+	 */
 gdt:
 	.word gdt_end - gdt_start
-	.long _pa(gdt_start)
-	.word 0
+	.quad _pa(gdt_start)
+	.balign 8
 gdt_start:
 	.quad 0x0000000000000000            /* NULL descriptor */
 	.quad 0x0000000000000000            /* reserved */
-- 
2.16.2.660.g709887971b-goog

^ permalink raw reply related

* [PATCH v2 19/27] kvm: Adapt assembly for PIE support
From: Thomas Garnier via Virtualization @ 2018-03-13 20:59 UTC (permalink / raw)
  To: Herbert Xu, David S . Miller, Thomas Gleixner, Ingo Molnar,
	H . Peter Anvin, Peter Zijlstra, Josh Poimboeuf,
	Greg Kroah-Hartman, Kate Stewart, Thomas Garnier, Arnd Bergmann,
	Philippe Ombredanne, Arnaldo Carvalho de Melo, Andrey Ryabinin,
	Matthias Kaehlcke, Kees Cook, Tom Lendacky, Kirill A . Shutemov,
	Andy Lutomirski, Dominik Brodowski, Borislav Petkov,
	Borislav Petkov, Raf
  Cc: linux-arch, kvm, linux-pm, x86, linux-doc, linux-kernel,
	virtualization, linux-sparse, linux-crypto, kernel-hardening,
	xen-devel
In-Reply-To: <20180313205945.245105-1-thgarnie@google.com>

Change the assembly code to use only relative references of symbols for the
kernel to be PIE compatible. The new __ASM_MOVABS macro is used to
get the address of a symbol on both 32 and 64-bit with PIE support.

Position Independent Executable (PIE) support will allow to extended the
KASLR randomization range below the -2G memory limit.

Signed-off-by: Thomas Garnier <thgarnie@google.com>
---
 arch/x86/include/asm/kvm_host.h | 6 ++++--
 arch/x86/kernel/kvm.c           | 6 ++++--
 arch/x86/kvm/svm.c              | 4 ++--
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b605a5b6a30c..7bd6ba79e778 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1380,9 +1380,11 @@ asmlinkage void kvm_spurious_fault(void);
 	".pushsection .fixup, \"ax\" \n" \
 	"667: \n\t" \
 	cleanup_insn "\n\t"		      \
-	"cmpb $0, kvm_rebooting \n\t"	      \
+	"cmpb $0, kvm_rebooting" __ASM_SEL(,(%%rip)) " \n\t" \
 	"jne 668b \n\t"      		      \
-	__ASM_SIZE(push) " $666b \n\t"	      \
+	__ASM_SIZE(push) "%%" _ASM_AX " \n\t"		\
+	_ASM_MOVABS " $666b, %%" _ASM_AX "\n\t"	\
+	"xchg %%" _ASM_AX ", (%%" _ASM_SP ") \n\t"	\
 	"call kvm_spurious_fault \n\t"	      \
 	".popsection \n\t" \
 	_ASM_EXTABLE(666b, 667b)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index bc1a27280c4b..5e4dd958ea95 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -711,8 +711,10 @@ asm(
 ".global __raw_callee_save___kvm_vcpu_is_preempted;"
 ".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
 "__raw_callee_save___kvm_vcpu_is_preempted:"
-"movq	__per_cpu_offset(,%rdi,8), %rax;"
-"cmpb	$0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
+"leaq	__per_cpu_offset(%rip), %rax;"
+"movq	(%rax,%rdi,8), %rax;"
+"addq	" __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rip), %rax;"
+"cmpb	$0, (%rax);"
 "setne	%al;"
 "ret;"
 ".popsection");
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index be9c839e2c89..6835a2ce02e5 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -626,12 +626,12 @@ static u32 svm_msrpm_offset(u32 msr)
 
 static inline void clgi(void)
 {
-	asm volatile (__ex(SVM_CLGI));
+	asm volatile (__ex(SVM_CLGI) : :);
 }
 
 static inline void stgi(void)
 {
-	asm volatile (__ex(SVM_STGI));
+	asm volatile (__ex(SVM_STGI) : :);
 }
 
 static inline void invlpga(unsigned long addr, u32 asid)
-- 
2.16.2.660.g709887971b-goog

^ permalink raw reply related

* [PATCH v2 20/27] x86: Support global stack cookie
From: Thomas Garnier via Virtualization @ 2018-03-13 20:59 UTC (permalink / raw)
  To: Herbert Xu, David S . Miller, Thomas Gleixner, Ingo Molnar,
	H . Peter Anvin, Peter Zijlstra, Josh Poimboeuf,
	Greg Kroah-Hartman, Kate Stewart, Thomas Garnier, Arnd Bergmann,
	Philippe Ombredanne, Arnaldo Carvalho de Melo, Andrey Ryabinin,
	Matthias Kaehlcke, Kees Cook, Tom Lendacky, Kirill A . Shutemov,
	Andy Lutomirski, Dominik Brodowski, Borislav Petkov,
	Borislav Petkov, Raf
  Cc: linux-arch, kvm, linux-pm, x86, linux-doc, linux-kernel,
	virtualization, linux-sparse, linux-crypto, kernel-hardening,
	xen-devel
In-Reply-To: <20180313205945.245105-1-thgarnie@google.com>

Add an off-by-default configuration option to use a global stack cookie
instead of the default TLS. This configuration option will only be used
with PIE binaries.

For kernel stack cookie, the compiler uses the mcmodel=kernel to switch
between the fs segment to gs segment. A PIE binary does not use
mcmodel=kernel because it can be relocated anywhere, therefore the
compiler will default to the fs segment register. This is fixed on the
latest version of gcc.

If the segment selector is available, it will be automatically added. If
the automatic configuration was selected, a warning is written and the
global variable stack cookie is used. If a specific stack mode was
selected (regular or strong) and the compiler does not support selecting
the segment register, an error is emitted.

Signed-off-by: Thomas Garnier <thgarnie@google.com>
---
 arch/x86/Kconfig                      | 12 ++++++++++++
 arch/x86/Makefile                     |  9 +++++++++
 arch/x86/entry/entry_32.S             |  3 ++-
 arch/x86/entry/entry_64.S             |  3 ++-
 arch/x86/include/asm/processor.h      |  3 ++-
 arch/x86/include/asm/stackprotector.h | 19 ++++++++++++++-----
 arch/x86/kernel/asm-offsets.c         |  3 ++-
 arch/x86/kernel/asm-offsets_32.c      |  3 ++-
 arch/x86/kernel/asm-offsets_64.c      |  3 ++-
 arch/x86/kernel/cpu/common.c          |  3 ++-
 arch/x86/kernel/head_32.S             |  3 ++-
 arch/x86/kernel/process.c             |  5 +++++
 12 files changed, 56 insertions(+), 13 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a0a777ce4c7c..0cb1ae187c3e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2236,6 +2236,18 @@ config RANDOMIZE_MEMORY_PHYSICAL_PADDING
 
 	   If unsure, leave at the default value.
 
+config X86_GLOBAL_STACKPROTECTOR
+	bool "Stack cookie using a global variable"
+	depends on CC_STACKPROTECTOR_AUTO
+	default n
+	---help---
+	   This option turns on the "stack-protector" GCC feature using a global
+	   variable instead of a segment register. It is useful when the
+	   compiler does not support custom segment registers when building a
+	   position independent (PIE) binary.
+
+	   If unsure, say N
+
 config HOTPLUG_CPU
 	bool "Support for hot-pluggable CPUs"
 	depends on SMP
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 498c1b812300..16dafc551f3b 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -142,6 +142,15 @@ else
         KBUILD_CFLAGS += $(call cc-option,-funit-at-a-time)
 endif
 
+ifdef CONFIG_X86_GLOBAL_STACKPROTECTOR
+        ifeq ($(call cc-option, -mstack-protector-guard=global),)
+                $(error Cannot use CONFIG_X86_GLOBAL_STACKPROTECTOR: \
+                        -mstack-protector-guard=global not supported \
+                        by compiler)
+        endif
+        KBUILD_CFLAGS += -mstack-protector-guard=global
+endif
+
 ifdef CONFIG_X86_X32
 	x32_ld_ok := $(call try-run,\
 			/bin/echo -e '1: .quad 1b' | \
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index bef8e2b202a8..b7d5bc710ae7 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -239,7 +239,8 @@ ENTRY(__switch_to_asm)
 	movl	%esp, TASK_threadsp(%eax)
 	movl	TASK_threadsp(%edx), %esp
 
-#ifdef CONFIG_CC_STACKPROTECTOR
+#if defined(CONFIG_CC_STACKPROTECTOR) && \
+	!defined(CONFIG_X86_GLOBAL_STACKPROTECTOR)
 	movl	TASK_stack_canary(%edx), %ebx
 	movl	%ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset
 #endif
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index d34794368e20..fbf2b63b4e78 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -356,7 +356,8 @@ ENTRY(__switch_to_asm)
 	movq	%rsp, TASK_threadsp(%rdi)
 	movq	TASK_threadsp(%rsi), %rsp
 
-#ifdef CONFIG_CC_STACKPROTECTOR
+#if defined(CONFIG_CC_STACKPROTECTOR) && \
+	!defined(CONFIG_X86_GLOBAL_STACKPROTECTOR)
 	movq	TASK_stack_canary(%rsi), %rbx
 	movq	%rbx, PER_CPU_VAR(irq_stack_union + stack_canary_offset)
 #endif
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 1b9488b1018a..f11284481597 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -411,7 +411,8 @@ DECLARE_PER_CPU(char *, irq_stack_ptr);
 DECLARE_PER_CPU(unsigned int, irq_count);
 extern asmlinkage void ignore_sysret(void);
 #else	/* X86_64 */
-#ifdef CONFIG_CC_STACKPROTECTOR
+#if defined(CONFIG_CC_STACKPROTECTOR) && \
+	!defined(CONFIG_X86_GLOBAL_STACKPROTECTOR)
 /*
  * Make sure stack canary segment base is cached-aligned:
  *   "For Intel Atom processors, avoid non zero segment base address
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index 371b3a4af000..5063f57d99f5 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -52,6 +52,10 @@
 #define GDT_STACK_CANARY_INIT						\
 	[GDT_ENTRY_STACK_CANARY] = GDT_ENTRY_INIT(0x4090, 0, 0x18),
 
+#ifdef CONFIG_X86_GLOBAL_STACKPROTECTOR
+extern unsigned long __stack_chk_guard;
+#endif
+
 /*
  * Initialize the stackprotector canary value.
  *
@@ -63,7 +67,7 @@ static __always_inline void boot_init_stack_canary(void)
 	u64 canary;
 	u64 tsc;
 
-#ifdef CONFIG_X86_64
+#if defined(CONFIG_X86_64) && !defined(CONFIG_X86_GLOBAL_STACKPROTECTOR)
 	BUILD_BUG_ON(offsetof(union irq_stack_union, stack_canary) != 40);
 #endif
 	/*
@@ -77,17 +81,22 @@ static __always_inline void boot_init_stack_canary(void)
 	canary += tsc + (tsc << 32UL);
 	canary &= CANARY_MASK;
 
+#ifdef CONFIG_X86_GLOBAL_STACKPROTECTOR
+	if (__stack_chk_guard == 0)
+		__stack_chk_guard = canary ?: 1;
+#else /* !CONFIG_X86_GLOBAL_STACKPROTECTOR */
 	current->stack_canary = canary;
 #ifdef CONFIG_X86_64
 	this_cpu_write(irq_stack_union.stack_canary, canary);
-#else
+#else /* CONFIG_X86_32 */
 	this_cpu_write(stack_canary.canary, canary);
 #endif
+#endif
 }
 
 static inline void setup_stack_canary_segment(int cpu)
 {
-#ifdef CONFIG_X86_32
+#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_GLOBAL_STACKPROTECTOR)
 	unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu);
 	struct desc_struct *gdt_table = get_cpu_gdt_rw(cpu);
 	struct desc_struct desc;
@@ -100,7 +109,7 @@ static inline void setup_stack_canary_segment(int cpu)
 
 static inline void load_stack_canary_segment(void)
 {
-#ifdef CONFIG_X86_32
+#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_GLOBAL_STACKPROTECTOR)
 	asm("mov %0, %%gs" : : "r" (__KERNEL_STACK_CANARY) : "memory");
 #endif
 }
@@ -116,7 +125,7 @@ static inline void setup_stack_canary_segment(int cpu)
 
 static inline void load_stack_canary_segment(void)
 {
-#ifdef CONFIG_X86_32
+#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_GLOBAL_STACKPROTECTOR)
 	asm volatile ("mov %0, %%gs" : : "r" (0));
 #endif
 }
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 76417a9aab73..4c9e1b667bda 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -32,7 +32,8 @@
 void common(void) {
 	BLANK();
 	OFFSET(TASK_threadsp, task_struct, thread.sp);
-#ifdef CONFIG_CC_STACKPROTECTOR
+#if defined(CONFIG_CC_STACKPROTECTOR) && \
+	!defined(CONFIG_X86_GLOBAL_STACKPROTECTOR)
 	OFFSET(TASK_stack_canary, task_struct, stack_canary);
 #endif
 
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index f91ba53e06c8..cf8ef55a8b82 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -50,7 +50,8 @@ void foo(void)
 	DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) -
 	       offsetofend(struct cpu_entry_area, entry_stack_page.stack));
 
-#ifdef CONFIG_CC_STACKPROTECTOR
+#if defined(CONFIG_CC_STACKPROTECTOR) && \
+	!defined(CONFIG_X86_GLOBAL_STACKPROTECTOR)
 	BLANK();
 	OFFSET(stack_canary_offset, stack_canary, canary);
 #endif
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index bf51e51d808d..a3c7e14f6434 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -69,7 +69,8 @@ int main(void)
 	OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
 	BLANK();
 
-#ifdef CONFIG_CC_STACKPROTECTOR
+#if defined(CONFIG_CC_STACKPROTECTOR) && \
+	!defined(CONFIG_X86_GLOBAL_STACKPROTECTOR)
 	DEFINE(stack_canary_offset, offsetof(union irq_stack_union, stack_canary));
 	BLANK();
 #endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 0e9a87a34f76..aa63274115af 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1509,7 +1509,8 @@ DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) =
 	(unsigned long)&init_thread_union + THREAD_SIZE;
 EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
 
-#ifdef CONFIG_CC_STACKPROTECTOR
+#if defined(CONFIG_CC_STACKPROTECTOR) && \
+	!defined(CONFIG_X86_GLOBAL_STACKPROTECTOR)
 DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
 #endif
 
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index b59e4fb40fd9..0e849242de91 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -375,7 +375,8 @@ ENDPROC(startup_32_smp)
  */
 __INIT
 setup_once:
-#ifdef CONFIG_CC_STACKPROTECTOR
+#if defined(CONFIG_CC_STACKPROTECTOR) && \
+	!defined(CONFIG_X86_GLOBAL_STACKPROTECTOR)
 	/*
 	 * Configure the stack canary. The linker can't handle this by
 	 * relocation.  Manually set base address in stack canary
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 03408b942adb..ebe21d258a82 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -86,6 +86,11 @@ EXPORT_PER_CPU_SYMBOL(cpu_tss_rw);
 DEFINE_PER_CPU(bool, __tss_limit_invalid);
 EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);
 
+#ifdef CONFIG_X86_GLOBAL_STACKPROTECTOR
+unsigned long __stack_chk_guard __read_mostly;
+EXPORT_SYMBOL(__stack_chk_guard);
+#endif
+
 /*
  * this gets called so that we can store lazy state into memory and copy the
  * current task into the new thread.
-- 
2.16.2.660.g709887971b-goog

^ permalink raw reply related

* [PATCH v2 21/27] x86/ftrace: Adapt function tracing for PIE support
From: Thomas Garnier via Virtualization @ 2018-03-13 20:59 UTC (permalink / raw)
  To: Herbert Xu, David S . Miller, Thomas Gleixner, Ingo Molnar,
	H . Peter Anvin, Peter Zijlstra, Josh Poimboeuf,
	Greg Kroah-Hartman, Kate Stewart, Thomas Garnier, Arnd Bergmann,
	Philippe Ombredanne, Arnaldo Carvalho de Melo, Andrey Ryabinin,
	Matthias Kaehlcke, Kees Cook, Tom Lendacky, Kirill A . Shutemov,
	Andy Lutomirski, Dominik Brodowski, Borislav Petkov,
	Borislav Petkov, Raf
  Cc: linux-arch, kvm, linux-pm, x86, linux-doc, linux-kernel,
	virtualization, linux-sparse, linux-crypto, kernel-hardening,
	xen-devel
In-Reply-To: <20180313205945.245105-1-thgarnie@google.com>

When using -fPIE/PIC with function tracing, the compiler generates a
call through the GOT (call *__fentry__@GOTPCREL). This instruction
takes 6 bytes instead of 5 on the usual relative call.

If PIE is enabled, replace the 6th byte of the GOT call by a 1-byte nop
so ftrace can handle the previous 5-bytes as before.

Position Independent Executable (PIE) support will allow to extended the
KASLR randomization range below the -2G memory limit.

Signed-off-by: Thomas Garnier <thgarnie@google.com>
---
 arch/x86/include/asm/ftrace.h   |  6 +++--
 arch/x86/include/asm/sections.h |  4 ++++
 arch/x86/kernel/ftrace.c        | 42 +++++++++++++++++++++++++++++++--
 3 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 09ad88572746..61fa02d81b95 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -25,9 +25,11 @@ extern void __fentry__(void);
 static inline unsigned long ftrace_call_adjust(unsigned long addr)
 {
 	/*
-	 * addr is the address of the mcount call instruction.
-	 * recordmcount does the necessary offset calculation.
+	 * addr is the address of the mcount call instruction. PIE has always a
+	 * byte added to the start of the function.
 	 */
+	if (IS_ENABLED(CONFIG_X86_PIE))
+		addr -= 1;
 	return addr;
 }
 
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
index d6baf23782bc..cad292f62eed 100644
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@@ -12,4 +12,8 @@ extern struct exception_table_entry __stop___ex_table[];
 extern char __end_rodata_hpage_align[];
 #endif
 
+#if defined(CONFIG_X86_PIE)
+extern char __start_got[], __end_got[];
+#endif
+
 #endif	/* _ASM_X86_SECTIONS_H */
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 01ebcb6f263e..21bde498f1a9 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -102,7 +102,7 @@ static const unsigned char *ftrace_nop_replace(void)
 
 static int
 ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
-		   unsigned const char *new_code)
+			  unsigned const char *new_code)
 {
 	unsigned char replaced[MCOUNT_INSN_SIZE];
 
@@ -135,6 +135,44 @@ ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
 	return 0;
 }
 
+/* Bytes before call GOT offset */
+const unsigned char got_call_preinsn[] = { 0xff, 0x15 };
+
+static int
+ftrace_modify_initial_code(unsigned long ip, unsigned const char *old_code,
+			   unsigned const char *new_code)
+{
+	unsigned char replaced[MCOUNT_INSN_SIZE + 1];
+
+	ftrace_expected = old_code;
+
+	/*
+	 * If PIE is not enabled or no GOT call was found, default to the
+	 * original approach to code modification.
+	 */
+	if (!IS_ENABLED(CONFIG_X86_PIE)
+	    || probe_kernel_read(replaced, (void *)ip, sizeof(replaced))
+	    || memcmp(replaced, got_call_preinsn, sizeof(got_call_preinsn)))
+		return ftrace_modify_code_direct(ip, old_code, new_code);
+
+	/*
+	 * Build a nop slide with a 5-byte nop and 1-byte nop to keep the ftrace
+	 * hooking algorithm working with the expected 5 bytes instruction.
+	 */
+	memcpy(replaced, new_code, MCOUNT_INSN_SIZE);
+	replaced[MCOUNT_INSN_SIZE] = ideal_nops[1][0];
+
+	ip = text_ip_addr(ip);
+
+	if (probe_kernel_write((void *)ip, replaced, sizeof(replaced)))
+		return -EPERM;
+
+	sync_core();
+
+	return 0;
+
+}
+
 int ftrace_make_nop(struct module *mod,
 		    struct dyn_ftrace *rec, unsigned long addr)
 {
@@ -153,7 +191,7 @@ int ftrace_make_nop(struct module *mod,
 	 * just modify the code directly.
 	 */
 	if (addr == MCOUNT_ADDR)
-		return ftrace_modify_code_direct(rec->ip, old, new);
+		return ftrace_modify_initial_code(rec->ip, old, new);
 
 	ftrace_expected = NULL;
 
-- 
2.16.2.660.g709887971b-goog

^ permalink raw reply related

* [PATCH v2 22/27] x86/modules: Add option to start module section after kernel
From: Thomas Garnier via Virtualization @ 2018-03-13 20:59 UTC (permalink / raw)
  To: Herbert Xu, David S . Miller, Thomas Gleixner, Ingo Molnar,
	H . Peter Anvin, Peter Zijlstra, Josh Poimboeuf,
	Greg Kroah-Hartman, Kate Stewart, Thomas Garnier, Arnd Bergmann,
	Philippe Ombredanne, Arnaldo Carvalho de Melo, Andrey Ryabinin,
	Matthias Kaehlcke, Kees Cook, Tom Lendacky, Kirill A . Shutemov,
	Andy Lutomirski, Dominik Brodowski, Borislav Petkov,
	Borislav Petkov, Raf
  Cc: linux-arch, kvm, linux-pm, x86, linux-doc, linux-kernel,
	virtualization, linux-sparse, linux-crypto, kernel-hardening,
	xen-devel
In-Reply-To: <20180313205945.245105-1-thgarnie@google.com>

Add an option so the module section is just after the mapped kernel. It
will ensure position independent modules are always at the right
distance from the kernel and do not require mcmodule=large. It also
optimize the available size for modules by getting rid of the empty
space on kernel randomization range.

Signed-off-by: Thomas Garnier <thgarnie@google.com>
---
 Documentation/x86/x86_64/mm.txt         | 3 +++
 arch/x86/Kconfig                        | 4 ++++
 arch/x86/include/asm/pgtable_64_types.h | 6 ++++++
 arch/x86/kernel/head64.c                | 5 ++++-
 arch/x86/mm/dump_pagetables.c           | 3 ++-
 5 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index ea91cb61a602..ec1dfe4c3cfe 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -77,3 +77,6 @@ Their order is preserved but their base will be offset early at boot time.
 Be very careful vs. KASLR when changing anything here. The KASLR address
 range must not overlap with anything except the KASAN shadow area, which is
 correct as KASAN disables KASLR.
+
+If CONFIG_DYNAMIC_MODULE_BASE is enabled, the module section follows the end of
+the mapped kernel.
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0cb1ae187c3e..df4134fd3247 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2236,6 +2236,10 @@ config RANDOMIZE_MEMORY_PHYSICAL_PADDING
 
 	   If unsure, leave at the default value.
 
+# Module section starts just after the end of the kernel module
+config DYNAMIC_MODULE_BASE
+	bool
+
 config X86_GLOBAL_STACKPROTECTOR
 	bool "Stack cookie using a global variable"
 	depends on CC_STACKPROTECTOR_AUTO
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index d5c21a382475..d4d3b21d5b3d 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -7,6 +7,7 @@
 #ifndef __ASSEMBLY__
 #include <linux/types.h>
 #include <asm/kaslr.h>
+#include <asm/sections.h>
 
 /*
  * These are used to make use of C type-checking..
@@ -126,7 +127,12 @@ extern unsigned int ptrs_per_p4d;
 
 #define VMALLOC_END		(VMALLOC_START + (VMALLOC_SIZE_TB << 40) - 1)
 
+#ifdef CONFIG_DYNAMIC_MODULE_BASE
+#define MODULES_VADDR		ALIGN(((unsigned long)_end + PAGE_SIZE), PMD_SIZE)
+#else
 #define MODULES_VADDR		(__START_KERNEL_map + KERNEL_IMAGE_SIZE)
+#endif
+
 /* The module sections ends with the start of the fixmap */
 #define MODULES_END		_AC(0xffffffffff000000, UL)
 #define MODULES_LEN		(MODULES_END - MODULES_VADDR)
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 2fe60e661227..ea4c498369d8 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -384,12 +384,15 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
 	 * Build-time sanity checks on the kernel image and module
 	 * area mappings. (these are purely build-time and produce no code)
 	 */
+#ifndef CONFIG_DYNAMIC_MODULE_BASE
 	BUILD_BUG_ON(MODULES_VADDR < __START_KERNEL_map);
 	BUILD_BUG_ON(MODULES_VADDR - __START_KERNEL_map < KERNEL_IMAGE_SIZE);
-	BUILD_BUG_ON(MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE);
+	BUILD_BUG_ON(!IS_ENABLED(CONFIG_RANDOMIZE_BASE_LARGE) &&
+		     MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE);
 	BUILD_BUG_ON((__START_KERNEL_map & ~PMD_MASK) != 0);
 	BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0);
 	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
+#endif
 	MAYBE_BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
 				(__START_KERNEL & PGDIR_MASK)));
 	BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 62a7e9f65dec..6f0b1fa2a71a 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -104,7 +104,7 @@ static struct addr_marker address_markers[] = {
 	[EFI_END_NR]		= { EFI_VA_END,		"EFI Runtime Services" },
 #endif
 	[HIGH_KERNEL_NR]	= { __START_KERNEL_map,	"High Kernel Mapping" },
-	[MODULES_VADDR_NR]	= { MODULES_VADDR,	"Modules" },
+	[MODULES_VADDR_NR]	= { 0/*MODULES_VADDR*/,	"Modules" },
 	[MODULES_END_NR]	= { MODULES_END,	"End Modules" },
 	[FIXADDR_START_NR]	= { FIXADDR_START,	"Fixmap Area" },
 	[END_OF_SPACE_NR]	= { -1,			NULL }
@@ -599,6 +599,7 @@ static int __init pt_dump_init(void)
 	address_markers[KASAN_SHADOW_START_NR].start_address = KASAN_SHADOW_START;
 	address_markers[KASAN_SHADOW_END_NR].start_address = KASAN_SHADOW_END;
 #endif
+	address_markers[MODULES_VADDR_NR].start_address = MODULES_VADDR;
 #endif
 #ifdef CONFIG_X86_32
 	address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
-- 
2.16.2.660.g709887971b-goog

^ permalink raw reply related

* [PATCH v2 23/27] x86/modules: Adapt module loading for PIE support
From: Thomas Garnier via Virtualization @ 2018-03-13 20:59 UTC (permalink / raw)
  To: Herbert Xu, David S . Miller, Thomas Gleixner, Ingo Molnar,
	H . Peter Anvin, Peter Zijlstra, Josh Poimboeuf,
	Greg Kroah-Hartman, Kate Stewart, Thomas Garnier, Arnd Bergmann,
	Philippe Ombredanne, Arnaldo Carvalho de Melo, Andrey Ryabinin,
	Matthias Kaehlcke, Kees Cook, Tom Lendacky, Kirill A . Shutemov,
	Andy Lutomirski, Dominik Brodowski, Borislav Petkov,
	Borislav Petkov, Raf
  Cc: linux-arch, kvm, linux-pm, x86, linux-doc, linux-kernel,
	virtualization, linux-sparse, linux-crypto, kernel-hardening,
	xen-devel
In-Reply-To: <20180313205945.245105-1-thgarnie@google.com>

Adapt module loading to support PIE relocations. Generate dynamic GOT if
a symbol requires it but no entry exist in the kernel GOT.

Position Independent Executable (PIE) support will allow to extended the
KASLR randomization range below the -2G memory limit.

Signed-off-by: Thomas Garnier <thgarnie@google.com>
---
 arch/x86/Makefile               |   4 +
 arch/x86/include/asm/module.h   |  11 ++
 arch/x86/include/asm/sections.h |   4 +
 arch/x86/kernel/module.c        | 181 +++++++++++++++++++++++++++++++-
 arch/x86/kernel/module.lds      |   3 +
 5 files changed, 198 insertions(+), 5 deletions(-)
 create mode 100644 arch/x86/kernel/module.lds

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 16dafc551f3b..f24d200c0d9d 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -135,7 +135,11 @@ else
         KBUILD_CFLAGS += $(cflags-y)
 
         KBUILD_CFLAGS += -mno-red-zone
+ifdef CONFIG_X86_PIE
+        KBUILD_LDFLAGS_MODULE += -T $(srctree)/arch/x86/kernel/module.lds
+else
         KBUILD_CFLAGS += -mcmodel=kernel
+endif
 
         # -funit-at-a-time shrinks the kernel .text considerably
         # unfortunately it makes reading oopses harder.
diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h
index 7948a17febb4..68ff05e14288 100644
--- a/arch/x86/include/asm/module.h
+++ b/arch/x86/include/asm/module.h
@@ -5,12 +5,23 @@
 #include <asm-generic/module.h>
 #include <asm/orc_types.h>
 
+#ifdef CONFIG_X86_PIE
+struct mod_got_sec {
+	struct elf64_shdr	*got;
+	int			got_num_entries;
+	int			got_max_entries;
+};
+#endif
+
 struct mod_arch_specific {
 #ifdef CONFIG_UNWINDER_ORC
 	unsigned int num_orcs;
 	int *orc_unwind_ip;
 	struct orc_entry *orc_unwind;
 #endif
+#ifdef CONFIG_X86_PIE
+	struct mod_got_sec	core;
+#endif
 };
 
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
index cad292f62eed..0bbd9f941573 100644
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@@ -16,4 +16,8 @@ extern char __end_rodata_hpage_align[];
 extern char __start_got[], __end_got[];
 #endif
 
+#if defined(CONFIG_X86_PIE)
+extern char __start_got[], __end_got[];
+#endif
+
 #endif	/* _ASM_X86_SECTIONS_H */
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index f58336af095c..88895f3d474b 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -30,6 +30,7 @@
 #include <linux/gfp.h>
 #include <linux/jump_label.h>
 #include <linux/random.h>
+#include <linux/sort.h>
 
 #include <asm/text-patching.h>
 #include <asm/page.h>
@@ -77,6 +78,173 @@ static unsigned long int get_module_load_offset(void)
 }
 #endif
 
+#ifdef CONFIG_X86_PIE
+static u64 find_got_kernel_entry(Elf64_Sym *sym, const Elf64_Rela *rela)
+{
+	u64 *pos;
+
+	for (pos = (u64*)__start_got; pos < (u64*)__end_got; pos++) {
+		if (*pos == sym->st_value)
+			return (u64)pos + rela->r_addend;
+	}
+
+	return 0;
+}
+
+static u64 module_emit_got_entry(struct module *mod, void *loc,
+				 const Elf64_Rela *rela, Elf64_Sym *sym)
+{
+	struct mod_got_sec *gotsec = &mod->arch.core;
+	u64 *got = (u64*)gotsec->got->sh_addr;
+	int i = gotsec->got_num_entries;
+	u64 ret;
+
+	/* Check if we can use the kernel GOT */
+	ret = find_got_kernel_entry(sym, rela);
+	if (ret)
+		return ret;
+
+	got[i] = sym->st_value;
+
+	/*
+	 * Check if the entry we just created is a duplicate. Given that the
+	 * relocations are sorted, this will be the last entry we allocated.
+	 * (if one exists).
+	 */
+	if (i > 0 && got[i] == got[i - 2]) {
+		ret = (u64)&got[i - 1];
+	} else {
+		gotsec->got_num_entries++;
+		BUG_ON(gotsec->got_num_entries > gotsec->got_max_entries);
+		ret = (u64)&got[i];
+	}
+
+	return ret + rela->r_addend;
+}
+
+#define cmp_3way(a,b)	((a) < (b) ? -1 : (a) > (b))
+
+static int cmp_rela(const void *a, const void *b)
+{
+	const Elf64_Rela *x = a, *y = b;
+	int i;
+
+	/* sort by type, symbol index and addend */
+	i = cmp_3way(ELF64_R_TYPE(x->r_info), ELF64_R_TYPE(y->r_info));
+	if (i == 0)
+		i = cmp_3way(ELF64_R_SYM(x->r_info), ELF64_R_SYM(y->r_info));
+	if (i == 0)
+		i = cmp_3way(x->r_addend, y->r_addend);
+	return i;
+}
+
+static bool duplicate_rel(const Elf64_Rela *rela, int num)
+{
+	/*
+	 * Entries are sorted by type, symbol index and addend. That means
+	 * that, if a duplicate entry exists, it must be in the preceding
+	 * slot.
+	 */
+	return num > 0 && cmp_rela(rela + num, rela + num - 1) == 0;
+}
+
+static unsigned int count_gots(Elf64_Sym *syms, Elf64_Rela *rela, int num)
+{
+	unsigned int ret = 0;
+	Elf64_Sym *s;
+	int i;
+
+	for (i = 0; i < num; i++) {
+		switch (ELF64_R_TYPE(rela[i].r_info)) {
+		case R_X86_64_GOTPCREL:
+			s = syms + ELF64_R_SYM(rela[i].r_info);
+
+			/*
+			 * Use the kernel GOT when possible, else reserve a
+			 * custom one for this module.
+			 */
+			if (!duplicate_rel(rela, i) &&
+			    !find_got_kernel_entry(s, rela + i))
+				ret++;
+			break;
+		}
+	}
+	return ret;
+}
+
+/*
+ * Generate GOT entries for GOTPCREL relocations that do not exists in the
+ * kernel GOT. Based on arm64 module-plts implementation.
+ */
+int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+			      char *secstrings, struct module *mod)
+{
+	unsigned long gots = 0;
+	Elf_Shdr *symtab = NULL;
+	Elf64_Sym *syms = NULL;
+	char *strings, *name;
+	int i;
+
+	/*
+	 * Find the empty .got section so we can expand it to store the PLT
+	 * entries. Record the symtab address as well.
+	 */
+	for (i = 0; i < ehdr->e_shnum; i++) {
+		if (!strcmp(secstrings + sechdrs[i].sh_name, ".got")) {
+			mod->arch.core.got = sechdrs + i;
+		} else if (sechdrs[i].sh_type == SHT_SYMTAB) {
+			symtab = sechdrs + i;
+			syms = (Elf64_Sym *)symtab->sh_addr;
+		}
+	}
+
+	if (!mod->arch.core.got) {
+		pr_err("%s: module GOT section missing\n", mod->name);
+		return -ENOEXEC;
+	}
+	if (!syms) {
+		pr_err("%s: module symtab section missing\n", mod->name);
+		return -ENOEXEC;
+	}
+
+	for (i = 0; i < ehdr->e_shnum; i++) {
+		Elf64_Rela *rels = (void *)ehdr + sechdrs[i].sh_offset;
+		int numrels = sechdrs[i].sh_size / sizeof(Elf64_Rela);
+
+		if (sechdrs[i].sh_type != SHT_RELA)
+			continue;
+
+		/* sort by type, symbol index and addend */
+		sort(rels, numrels, sizeof(Elf64_Rela), cmp_rela, NULL);
+
+		gots += count_gots(syms, rels, numrels);
+	}
+
+	mod->arch.core.got->sh_type = SHT_NOBITS;
+	mod->arch.core.got->sh_flags = SHF_ALLOC;
+	mod->arch.core.got->sh_addralign = L1_CACHE_BYTES;
+	mod->arch.core.got->sh_size = (gots + 1) * sizeof(u64);
+	mod->arch.core.got_num_entries = 0;
+	mod->arch.core.got_max_entries = gots;
+
+	/*
+	 * If a _GLOBAL_OFFSET_TABLE_ symbol exists, make it absolute for
+	 * modules to correctly reference it. Similar to s390 implementation.
+	 */
+	strings = (void *) ehdr + sechdrs[symtab->sh_link].sh_offset;
+	for (i = 0; i < symtab->sh_size/sizeof(Elf_Sym); i++) {
+		if (syms[i].st_shndx != SHN_UNDEF)
+			continue;
+		name = strings + syms[i].st_name;
+		if (!strcmp(name, "_GLOBAL_OFFSET_TABLE_")) {
+			syms[i].st_shndx = SHN_ABS;
+			break;
+		}
+	}
+	return 0;
+}
+#endif
+
 void *module_alloc(unsigned long size)
 {
 	void *p;
@@ -190,16 +358,20 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
 			if ((s64)val != *(s32 *)loc)
 				goto overflow;
 			break;
+#ifdef CONFIG_X86_PIE
+		case R_X86_64_GOTPCREL:
+			val = module_emit_got_entry(me, loc, rel + i, sym);
+			/* fallthrough */
+#endif
 		case R_X86_64_PC32:
 		case R_X86_64_PLT32:
 			if (*(u32 *)loc != 0)
 				goto invalid_relocation;
 			val -= (u64)loc;
 			*(u32 *)loc = val;
-#if 0
-			if ((s64)val != *(s32 *)loc)
+			if (IS_ENABLED(CONFIG_X86_PIE) &&
+			    (s64)val != *(s32 *)loc)
 				goto overflow;
-#endif
 			break;
 		default:
 			pr_err("%s: Unknown rela relocation: %llu\n",
@@ -217,8 +389,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
 overflow:
 	pr_err("overflow in relocation type %d val %Lx\n",
 	       (int)ELF64_R_TYPE(rel[i].r_info), val);
-	pr_err("`%s' likely not compiled with -mcmodel=kernel\n",
-	       me->name);
+	pr_err("`%s' likely too far from the kernel\n", me->name);
 	return -ENOEXEC;
 }
 #endif
diff --git a/arch/x86/kernel/module.lds b/arch/x86/kernel/module.lds
new file mode 100644
index 000000000000..fd6e95a4b454
--- /dev/null
+++ b/arch/x86/kernel/module.lds
@@ -0,0 +1,3 @@
+SECTIONS {
+	.got (NOLOAD) : { BYTE(0) }
+}
-- 
2.16.2.660.g709887971b-goog

^ permalink raw reply related

* [PATCH v2 24/27] x86/mm: Make the x86 GOT read-only
From: Thomas Garnier via Virtualization @ 2018-03-13 20:59 UTC (permalink / raw)
  To: Herbert Xu, David S . Miller, Thomas Gleixner, Ingo Molnar,
	H . Peter Anvin, Peter Zijlstra, Josh Poimboeuf,
	Greg Kroah-Hartman, Kate Stewart, Thomas Garnier, Arnd Bergmann,
	Philippe Ombredanne, Arnaldo Carvalho de Melo, Andrey Ryabinin,
	Matthias Kaehlcke, Kees Cook, Tom Lendacky, Kirill A . Shutemov,
	Andy Lutomirski, Dominik Brodowski, Borislav Petkov,
	Borislav Petkov, Raf
  Cc: linux-arch, kvm, linux-pm, x86, linux-doc, linux-kernel,
	virtualization, linux-sparse, linux-crypto, kernel-hardening,
	xen-devel
In-Reply-To: <20180313205945.245105-1-thgarnie@google.com>

The GOT is changed during early boot when relocations are applied. Make
it read-only directly. This table exists only for PIE binary.

Position Independent Executable (PIE) support will allow to extended the
KASLR randomization range below the -2G memory limit.

Signed-off-by: Thomas Garnier <thgarnie@google.com>
---
 include/asm-generic/vmlinux.lds.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 1ab0e520d6fc..89398d042f78 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -295,6 +295,17 @@
 	VMLINUX_SYMBOL(__end_ro_after_init) = .;
 #endif
 
+#ifdef CONFIG_X86_PIE
+#define RO_GOT_X86							\
+	.got        : AT(ADDR(.got) - LOAD_OFFSET) {			\
+		VMLINUX_SYMBOL(__start_got) = .;			\
+		*(.got);						\
+		VMLINUX_SYMBOL(__end_got) = .;				\
+	}
+#else
+#define RO_GOT_X86
+#endif
+
 /*
  * Read only Data
  */
@@ -351,6 +362,7 @@
 		VMLINUX_SYMBOL(__end_builtin_fw) = .;			\
 	}								\
 									\
+	RO_GOT_X86							\
 	TRACEDATA							\
 									\
 	/* Kernel symbol table: Normal symbols */			\
-- 
2.16.2.660.g709887971b-goog

^ permalink raw reply related

* [PATCH v2 25/27] x86/pie: Add option to build the kernel as PIE
From: Thomas Garnier via Virtualization @ 2018-03-13 20:59 UTC (permalink / raw)
  To: Herbert Xu, David S . Miller, Thomas Gleixner, Ingo Molnar,
	H . Peter Anvin, Peter Zijlstra, Josh Poimboeuf,
	Greg Kroah-Hartman, Kate Stewart, Thomas Garnier, Arnd Bergmann,
	Philippe Ombredanne, Arnaldo Carvalho de Melo, Andrey Ryabinin,
	Matthias Kaehlcke, Kees Cook, Tom Lendacky, Kirill A . Shutemov,
	Andy Lutomirski, Dominik Brodowski, Borislav Petkov,
	Borislav Petkov, Raf
  Cc: linux-arch, kvm, linux-pm, x86, linux-doc, linux-kernel,
	virtualization, linux-sparse, linux-crypto, kernel-hardening,
	xen-devel
In-Reply-To: <20180313205945.245105-1-thgarnie@google.com>

Add the CONFIG_X86_PIE option which builds the kernel as a Position
Independent Executable (PIE). The kernel is currently build with the
mcmodel=kernel option which forces it to stay on the top 2G of the
virtual address space. With PIE, the kernel will be able to move below
the current limit.

The --emit-relocs linker option was kept instead of using -pie to limit
the impact on mapped sections. Any incompatible relocation will be
catch by the arch/x86/tools/relocs binary at compile time.

If segment based stack cookies are enabled, try to use the compiler
option to select the segment register. If not available, automatically
enabled global stack cookie in auto mode. Otherwise, recommend
compiler update or global stack cookie option.

Performance/Size impact:

Size of vmlinux (Default configuration):
 File size:
 - PIE disabled: +0.18%
 - PIE enabled: -1.977% (less relocations)
 .text section:
 - PIE disabled: same
 - PIE enabled: same

Size of vmlinux (Ubuntu configuration):
 File size:
 - PIE disabled: +0.21%
 - PIE enabled: +10%
 .text section:
 - PIE disabled: same
 - PIE enabled: +0.001%

The size increase is mainly due to not having access to the 32-bit signed
relocation that can be used with mcmodel=kernel. A small part is due to reduced
optimization for PIE code. This bug [1] was opened with gcc to provide a better
code generation for kernel PIE.

Hackbench (50% and 1600% on thread/process for pipe/sockets):
 - PIE disabled: no significant change (avg -/+ 0.5% on latest test).
 - PIE enabled: between -1% to +1% in average (default and Ubuntu config).

Kernbench (average of 10 Half and Optimal runs):
 Elapsed Time:
 - PIE disabled: no significant change (avg -0.5%)
 - PIE enabled: average -0.5% to +0.5%
 System Time:
 - PIE disabled: no significant change (avg -0.1%)
 - PIE enabled: average -0.4% to +0.4%.

[1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82303

Signed-off-by: Thomas Garnier <thgarnie@google.com>

merge pie
---
 arch/x86/Kconfig  |  8 ++++++++
 arch/x86/Makefile | 45 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index df4134fd3247..4b1615e661d6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2252,6 +2252,14 @@ config X86_GLOBAL_STACKPROTECTOR
 
 	   If unsure, say N
 
+config X86_PIE
+	bool
+	depends on X86_64
+	select DEFAULT_HIDDEN
+	select WEAK_PROVIDE_HIDDEN
+	select DYNAMIC_MODULE_BASE
+	select MODULE_REL_CRCS if MODVERSIONS
+
 config HOTPLUG_CPU
 	bool "Support for hot-pluggable CPUs"
 	depends on SMP
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index f24d200c0d9d..ab0cf88c7059 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -61,6 +61,8 @@ endif
 KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
 KBUILD_CFLAGS += $(call cc-option,-mno-avx,)
 
+stackglobal := $(call cc-option-yn, -mstack-protector-guard=global)
+
 ifeq ($(CONFIG_X86_32),y)
         BITS := 32
         UTS_MACHINE := i386
@@ -136,7 +138,48 @@ else
 
         KBUILD_CFLAGS += -mno-red-zone
 ifdef CONFIG_X86_PIE
+        KBUILD_CFLAGS += -fPIE
         KBUILD_LDFLAGS_MODULE += -T $(srctree)/arch/x86/kernel/module.lds
+
+        # Relax relocation in both CFLAGS and LDFLAGS to support older compilers
+        KBUILD_CFLAGS += $(call cc-option,-Wa$(comma)-mrelax-relocations=no)
+        LDFLAGS_vmlinux += $(call ld-option,--no-relax)
+        KBUILD_LDFLAGS_MODULE += $(call ld-option,--no-relax)
+
+        # Stack validation is not yet support due to self-referenced switches
+ifdef CONFIG_STACK_VALIDATION
+        $(warning CONFIG_STACK_VALIDATION is not yet supported for x86_64 pie \
+	        build.)
+        SKIP_STACK_VALIDATION := 1
+        export SKIP_STACK_VALIDATION
+endif
+
+ifndef CONFIG_CC_STACKPROTECTOR_NONE
+ifndef CONFIG_X86_GLOBAL_STACKPROTECTOR
+        stackseg-flag := -mstack-protector-guard-reg=%gs
+        ifeq ($(call cc-option-yn,$(stackseg-flag)),n)
+                # Try to enable global stack cookie if possible
+                ifeq ($(stackglobal), y)
+                        $(warning Cannot use CONFIG_CC_STACKPROTECTOR_* while \
+                                building a position independent kernel. \
+                                Default to global stack protector \
+                                (CONFIG_X86_GLOBAL_STACKPROTECTOR).)
+                        CONFIG_X86_GLOBAL_STACKPROTECTOR := y
+                        KBUILD_CFLAGS += -DCONFIG_X86_GLOBAL_STACKPROTECTOR
+                        KBUILD_AFLAGS += -DCONFIG_X86_GLOBAL_STACKPROTECTOR
+                else
+                        $(error echo Cannot use \
+                                CONFIG_CC_STACKPROTECTOR_(REGULAR|STRONG|AUTO) \
+                                while building a position independent binary. \
+                                Update your compiler or use \
+                                CONFIG_X86_GLOBAL_STACKPROTECTOR)
+                endif
+        else
+                KBUILD_CFLAGS += $(stackseg-flag)
+        endif
+endif
+endif
+
 else
         KBUILD_CFLAGS += -mcmodel=kernel
 endif
@@ -147,7 +190,7 @@ endif
 endif
 
 ifdef CONFIG_X86_GLOBAL_STACKPROTECTOR
-        ifeq ($(call cc-option, -mstack-protector-guard=global),)
+        ifeq ($(stackglobal), n)
                 $(error Cannot use CONFIG_X86_GLOBAL_STACKPROTECTOR: \
                         -mstack-protector-guard=global not supported \
                         by compiler)
-- 
2.16.2.660.g709887971b-goog

^ permalink raw reply related

* [PATCH v2 26/27] x86/relocs: Add option to generate 64-bit relocations
From: Thomas Garnier via Virtualization @ 2018-03-13 20:59 UTC (permalink / raw)
  To: Herbert Xu, David S . Miller, Thomas Gleixner, Ingo Molnar,
	H . Peter Anvin, Peter Zijlstra, Josh Poimboeuf,
	Greg Kroah-Hartman, Kate Stewart, Thomas Garnier, Arnd Bergmann,
	Philippe Ombredanne, Arnaldo Carvalho de Melo, Andrey Ryabinin,
	Matthias Kaehlcke, Kees Cook, Tom Lendacky, Kirill A . Shutemov,
	Andy Lutomirski, Dominik Brodowski, Borislav Petkov,
	Borislav Petkov, Raf
  Cc: linux-arch, kvm, linux-pm, x86, linux-doc, linux-kernel,
	virtualization, linux-sparse, linux-crypto, kernel-hardening,
	xen-devel
In-Reply-To: <20180313205945.245105-1-thgarnie@google.com>

The x86 relocation tool generates a list of 32-bit signed integers. There
was no need to use 64-bit integers because all addresses where above the 2G
top of the memory.

This change add a large-reloc option to generate 64-bit unsigned integers.
It can be used when the kernel plan to go below the top 2G and 32-bit
integers are not enough.

Signed-off-by: Thomas Garnier <thgarnie@google.com>
---
 arch/x86/tools/relocs.c        | 60 +++++++++++++++++++++++++++-------
 arch/x86/tools/relocs.h        |  4 +--
 arch/x86/tools/relocs_common.c | 15 ++++++---
 3 files changed, 60 insertions(+), 19 deletions(-)

diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index 29283ad3950f..a29cccceaac6 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -13,8 +13,14 @@
 
 static Elf_Ehdr ehdr;
 
+#if ELF_BITS == 64
+typedef uint64_t rel_off_t;
+#else
+typedef uint32_t rel_off_t;
+#endif
+
 struct relocs {
-	uint32_t	*offset;
+	rel_off_t	*offset;
 	unsigned long	count;
 	unsigned long	size;
 };
@@ -685,7 +691,7 @@ static void print_absolute_relocs(void)
 		printf("\n");
 }
 
-static void add_reloc(struct relocs *r, uint32_t offset)
+static void add_reloc(struct relocs *r, rel_off_t offset)
 {
 	if (r->count == r->size) {
 		unsigned long newsize = r->size + 50000;
@@ -1061,26 +1067,48 @@ static void sort_relocs(struct relocs *r)
 	qsort(r->offset, r->count, sizeof(r->offset[0]), cmp_relocs);
 }
 
-static int write32(uint32_t v, FILE *f)
+static int write32(rel_off_t rel, FILE *f)
 {
-	unsigned char buf[4];
+	unsigned char buf[sizeof(uint32_t)];
+	uint32_t v = (uint32_t)rel;
 
 	put_unaligned_le32(v, buf);
-	return fwrite(buf, 1, 4, f) == 4 ? 0 : -1;
+	return fwrite(buf, 1, sizeof(buf), f) == sizeof(buf) ? 0 : -1;
 }
 
-static int write32_as_text(uint32_t v, FILE *f)
+static int write32_as_text(rel_off_t rel, FILE *f)
 {
+	uint32_t v = (uint32_t)rel;
 	return fprintf(f, "\t.long 0x%08"PRIx32"\n", v) > 0 ? 0 : -1;
 }
 
-static void emit_relocs(int as_text, int use_real_mode)
+static int write64(rel_off_t rel, FILE *f)
+{
+	unsigned char buf[sizeof(uint64_t)];
+	uint64_t v = (uint64_t)rel;
+
+	put_unaligned_le64(v, buf);
+	return fwrite(buf, 1, sizeof(buf), f) == sizeof(buf) ? 0 : -1;
+}
+
+static int write64_as_text(rel_off_t rel, FILE *f)
+{
+	uint64_t v = (uint64_t)rel;
+	return fprintf(f, "\t.quad 0x%016"PRIx64"\n", v) > 0 ? 0 : -1;
+}
+
+static void emit_relocs(int as_text, int use_real_mode, int use_large_reloc)
 {
 	int i;
-	int (*write_reloc)(uint32_t, FILE *) = write32;
+	int (*write_reloc)(rel_off_t, FILE *);
 	int (*do_reloc)(struct section *sec, Elf_Rel *rel, Elf_Sym *sym,
 			const char *symname);
 
+	if (use_large_reloc)
+		write_reloc = write64;
+	else
+		write_reloc = write32;
+
 #if ELF_BITS == 64
 	if (!use_real_mode)
 		do_reloc = do_reloc64;
@@ -1091,6 +1119,9 @@ static void emit_relocs(int as_text, int use_real_mode)
 		do_reloc = do_reloc32;
 	else
 		do_reloc = do_reloc_real;
+
+	/* Large relocations only for 64-bit */
+	use_large_reloc = 0;
 #endif
 
 	/* Collect up the relocations */
@@ -1114,8 +1145,13 @@ static void emit_relocs(int as_text, int use_real_mode)
 		 * gas will like.
 		 */
 		printf(".section \".data.reloc\",\"a\"\n");
-		printf(".balign 4\n");
-		write_reloc = write32_as_text;
+		if (use_large_reloc) {
+			printf(".balign 8\n");
+			write_reloc = write64_as_text;
+		} else {
+			printf(".balign 4\n");
+			write_reloc = write32_as_text;
+		}
 	}
 
 	if (use_real_mode) {
@@ -1183,7 +1219,7 @@ static void print_reloc_info(void)
 
 void process(FILE *fp, int use_real_mode, int as_text,
 	     int show_absolute_syms, int show_absolute_relocs,
-	     int show_reloc_info)
+	     int show_reloc_info, int use_large_reloc)
 {
 	regex_init(use_real_mode);
 	read_ehdr(fp);
@@ -1206,5 +1242,5 @@ void process(FILE *fp, int use_real_mode, int as_text,
 		print_reloc_info();
 		return;
 	}
-	emit_relocs(as_text, use_real_mode);
+	emit_relocs(as_text, use_real_mode, use_large_reloc);
 }
diff --git a/arch/x86/tools/relocs.h b/arch/x86/tools/relocs.h
index 43c83c0fd22c..3d401da59df7 100644
--- a/arch/x86/tools/relocs.h
+++ b/arch/x86/tools/relocs.h
@@ -31,8 +31,8 @@ enum symtype {
 
 void process_32(FILE *fp, int use_real_mode, int as_text,
 		int show_absolute_syms, int show_absolute_relocs,
-		int show_reloc_info);
+		int show_reloc_info, int use_large_reloc);
 void process_64(FILE *fp, int use_real_mode, int as_text,
 		int show_absolute_syms, int show_absolute_relocs,
-		int show_reloc_info);
+		int show_reloc_info, int use_large_reloc);
 #endif /* RELOCS_H */
diff --git a/arch/x86/tools/relocs_common.c b/arch/x86/tools/relocs_common.c
index 6634352a20bc..11f49adf1c06 100644
--- a/arch/x86/tools/relocs_common.c
+++ b/arch/x86/tools/relocs_common.c
@@ -12,14 +12,14 @@ void die(char *fmt, ...)
 
 static void usage(void)
 {
-	die("relocs [--abs-syms|--abs-relocs|--reloc-info|--text|--realmode]" \
-	    " vmlinux\n");
+	die("relocs [--abs-syms|--abs-relocs|--reloc-info|--text|--realmode|" \
+	    "--large-reloc]  vmlinux\n");
 }
 
 int main(int argc, char **argv)
 {
 	int show_absolute_syms, show_absolute_relocs, show_reloc_info;
-	int as_text, use_real_mode;
+	int as_text, use_real_mode, use_large_reloc;
 	const char *fname;
 	FILE *fp;
 	int i;
@@ -30,6 +30,7 @@ int main(int argc, char **argv)
 	show_reloc_info = 0;
 	as_text = 0;
 	use_real_mode = 0;
+	use_large_reloc = 0;
 	fname = NULL;
 	for (i = 1; i < argc; i++) {
 		char *arg = argv[i];
@@ -54,6 +55,10 @@ int main(int argc, char **argv)
 				use_real_mode = 1;
 				continue;
 			}
+			if (strcmp(arg, "--large-reloc") == 0) {
+				use_large_reloc = 1;
+				continue;
+			}
 		}
 		else if (!fname) {
 			fname = arg;
@@ -75,11 +80,11 @@ int main(int argc, char **argv)
 	if (e_ident[EI_CLASS] == ELFCLASS64)
 		process_64(fp, use_real_mode, as_text,
 			   show_absolute_syms, show_absolute_relocs,
-			   show_reloc_info);
+			   show_reloc_info, use_large_reloc);
 	else
 		process_32(fp, use_real_mode, as_text,
 			   show_absolute_syms, show_absolute_relocs,
-			   show_reloc_info);
+			   show_reloc_info, use_large_reloc);
 	fclose(fp);
 	return 0;
 }
-- 
2.16.2.660.g709887971b-goog

^ permalink raw reply related

* [PATCH v2 27/27] x86/kaslr: Add option to extend KASLR range from 1GB to 3GB
From: Thomas Garnier via Virtualization @ 2018-03-13 20:59 UTC (permalink / raw)
  To: Herbert Xu, David S . Miller, Thomas Gleixner, Ingo Molnar,
	H . Peter Anvin, Peter Zijlstra, Josh Poimboeuf,
	Greg Kroah-Hartman, Kate Stewart, Thomas Garnier, Arnd Bergmann,
	Philippe Ombredanne, Arnaldo Carvalho de Melo, Andrey Ryabinin,
	Matthias Kaehlcke, Kees Cook, Tom Lendacky, Kirill A . Shutemov,
	Andy Lutomirski, Dominik Brodowski, Borislav Petkov,
	Borislav Petkov, Raf
  Cc: linux-arch, kvm, linux-pm, x86, linux-doc, linux-kernel,
	virtualization, linux-sparse, linux-crypto, kernel-hardening,
	xen-devel
In-Reply-To: <20180313205945.245105-1-thgarnie@google.com>

Add a new CONFIG_RANDOMIZE_BASE_LARGE option to benefit from PIE
support. It increases the KASLR range from 1GB to 3GB. The new range
stars at 0xffffffff00000000 just above the EFI memory region. This
option is off by default.

The boot code is adapted to create the appropriate page table spanning
three PUD pages.

The relocation table uses 64-bit integers generated with the updated
relocation tool with the large-reloc option.

Signed-off-by: Thomas Garnier <thgarnie@google.com>
---
 arch/x86/Kconfig                     | 21 +++++++++++++++++++++
 arch/x86/boot/compressed/Makefile    |  5 +++++
 arch/x86/boot/compressed/misc.c      | 10 +++++++++-
 arch/x86/include/asm/page_64_types.h |  9 +++++++++
 arch/x86/kernel/head64.c             | 15 ++++++++++++---
 arch/x86/kernel/head_64.S            | 11 ++++++++++-
 6 files changed, 66 insertions(+), 5 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 4b1615e661d6..7ea69cb0153f 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2260,6 +2260,27 @@ config X86_PIE
 	select DYNAMIC_MODULE_BASE
 	select MODULE_REL_CRCS if MODVERSIONS
 
+config RANDOMIZE_BASE_LARGE
+	bool "Increase the randomization range of the kernel image"
+	depends on X86_64 && RANDOMIZE_BASE
+	select X86_PIE
+	select X86_MODULE_PLTS if MODULES
+	default n
+	---help---
+	  Build the kernel as a Position Independent Executable (PIE) and
+	  increase the available randomization range from 1GB to 3GB.
+
+	  This option impacts performance on kernel CPU intensive workloads up
+	  to 10% due to PIE generated code. Impact on user-mode processes and
+	  typical usage would be significantly less (0.50% when you build the
+	  kernel).
+
+	  The kernel and modules will generate slightly more assembly (1 to 2%
+	  increase on the .text sections). The vmlinux binary will be
+	  significantly smaller due to less relocations.
+
+	  If unsure say N
+
 config HOTPLUG_CPU
 	bool "Support for hot-pluggable CPUs"
 	depends on SMP
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 1f734cd98fd3..fb72f53defd0 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -116,7 +116,12 @@ $(obj)/vmlinux.bin: vmlinux FORCE
 
 targets += $(patsubst $(obj)/%,%,$(vmlinux-objs-y)) vmlinux.bin.all vmlinux.relocs
 
+# Large randomization require bigger relocation table
+ifeq ($(CONFIG_RANDOMIZE_BASE_LARGE),y)
+CMD_RELOCS = arch/x86/tools/relocs --large-reloc
+else
 CMD_RELOCS = arch/x86/tools/relocs
+endif
 quiet_cmd_relocs = RELOCS  $@
       cmd_relocs = $(CMD_RELOCS) $< > $@;$(CMD_RELOCS) --abs-relocs $<
 $(obj)/vmlinux.relocs: vmlinux FORCE
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index b50c42455e25..746a968690d5 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -170,10 +170,18 @@ void __puthex(unsigned long value)
 }
 
 #if CONFIG_X86_NEED_RELOCS
+
+/* Large randomization go lower than -2G and use large relocation table */
+#ifdef CONFIG_RANDOMIZE_BASE_LARGE
+typedef long rel_t;
+#else
+typedef int rel_t;
+#endif
+
 static void handle_relocations(void *output, unsigned long output_len,
 			       unsigned long virt_addr)
 {
-	int *reloc;
+	rel_t *reloc;
 	unsigned long delta, map, ptr;
 	unsigned long min_addr = (unsigned long)output;
 	unsigned long max_addr = min_addr + (VO___bss_start - VO__text);
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 2c5a966dc222..85ea681421d2 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -46,7 +46,11 @@
 #define __PAGE_OFFSET           __PAGE_OFFSET_BASE_L4
 #endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */
 
+#ifdef CONFIG_RANDOMIZE_BASE_LARGE
+#define __START_KERNEL_map	_AC(0xffffffff00000000, UL)
+#else
 #define __START_KERNEL_map	_AC(0xffffffff80000000, UL)
+#endif /* CONFIG_RANDOMIZE_BASE_LARGE */
 
 /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
 
@@ -64,9 +68,14 @@
  * 512MiB by default, leaving 1.5GiB for modules once the page tables
  * are fully set up. If kernel ASLR is configured, it can extend the
  * kernel page table mapping, reducing the size of the modules area.
+ * On PIE, we relocate the binary 2G lower so add this extra space.
  */
 #if defined(CONFIG_RANDOMIZE_BASE)
+#ifdef CONFIG_RANDOMIZE_BASE_LARGE
+#define KERNEL_IMAGE_SIZE	(_AC(3, UL) * 1024 * 1024 * 1024)
+#else
 #define KERNEL_IMAGE_SIZE	(1024 * 1024 * 1024)
+#endif
 #else
 #define KERNEL_IMAGE_SIZE	(512 * 1024 * 1024)
 #endif
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index ea4c498369d8..577b47381ba2 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -63,6 +63,7 @@ EXPORT_SYMBOL(vmemmap_base);
 #endif
 
 #define __head	__section(.head.text)
+#define pud_count(x)   (((x + (PUD_SIZE - 1)) & ~(PUD_SIZE - 1)) >> PUD_SHIFT)
 
 /* Required for read_cr3 when building as PIE */
 unsigned long __force_order;
@@ -112,6 +113,8 @@ unsigned long __head __startup_64(unsigned long physaddr,
 {
 	unsigned long load_delta, *p;
 	unsigned long pgtable_flags;
+	unsigned long level3_kernel_start, level3_kernel_count;
+	unsigned long level3_fixmap_start;
 	pgdval_t *pgd;
 	p4dval_t *p4d;
 	pudval_t *pud;
@@ -142,6 +145,11 @@ unsigned long __head __startup_64(unsigned long physaddr,
 	/* Include the SME encryption mask in the fixup value */
 	load_delta += sme_get_me_mask();
 
+	/* Look at the randomization spread to adapt page table used */
+	level3_kernel_start = pud_index(__START_KERNEL_map);
+	level3_kernel_count = pud_count(KERNEL_IMAGE_SIZE);
+	level3_fixmap_start = level3_kernel_start + level3_kernel_count;
+
 	/* Fixup the physical addresses in the page table */
 
 	pgd = fixup_pointer(&early_top_pgt, physaddr);
@@ -158,8 +166,9 @@ unsigned long __head __startup_64(unsigned long physaddr,
 	}
 
 	pud = fixup_pointer(&level3_kernel_pgt, physaddr);
-	pud[510] += load_delta;
-	pud[511] += load_delta;
+	for (i = 0; i < level3_kernel_count; i++)
+		pud[level3_kernel_start + i] += load_delta;
+	pud[level3_fixmap_start] += load_delta;
 
 	pmd = fixup_pointer(level2_fixmap_pgt, physaddr);
 	pmd[506] += load_delta;
@@ -214,7 +223,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
 	 */
 
 	pmd = fixup_pointer(level2_kernel_pgt, physaddr);
-	for (i = 0; i < PTRS_PER_PMD; i++) {
+	for (i = 0; i < PTRS_PER_PMD * level3_kernel_count; i++) {
 		if (pmd[i] & _PAGE_PRESENT)
 			pmd[i] += load_delta;
 	}
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 87624e1fe22f..c86d9262015f 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -41,12 +41,16 @@
 
 #define l4_index(x)	(((x) >> 39) & 511)
 #define pud_index(x)	(((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
+#define pud_count(x)   (((x + (PUD_SIZE - 1)) & ~(PUD_SIZE - 1)) >> PUD_SHIFT)
 
 L4_PAGE_OFFSET = l4_index(__PAGE_OFFSET_BASE_L4)
 L4_START_KERNEL = l4_index(__START_KERNEL_map)
 
 L3_START_KERNEL = pud_index(__START_KERNEL_map)
 
+/* Adapt page table L3 space based on range of randomization */
+L3_KERNEL_ENTRY_COUNT = pud_count(KERNEL_IMAGE_SIZE)
+
 	.text
 	__HEAD
 	.code64
@@ -436,7 +440,12 @@ NEXT_PAGE(level4_kernel_pgt)
 NEXT_PAGE(level3_kernel_pgt)
 	.fill	L3_START_KERNEL,8,0
 	/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
-	.quad	level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
+	i = 0
+	.rept	L3_KERNEL_ENTRY_COUNT
+	.quad	level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC \
+		+ PAGE_SIZE*i
+	i = i + 1
+	.endr
 	.quad	level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 
 NEXT_PAGE(level2_kernel_pgt)
-- 
2.16.2.660.g709887971b-goog

^ permalink raw reply related

* Re: [PATCH v2 06/27] x86/entry/64: Adapt assembly for PIE support
From: Peter Zijlstra @ 2018-03-14 10:29 UTC (permalink / raw)
  To: Thomas Garnier
  Cc: Kate Stewart, Nicolas Pitre, Michal Hocko, Sergey Senozhatsky,
	Petr Mladek, Len Brown, Boris Ostrovsky, Christopher Li,
	Dave Hansen, x86, Dominik Brodowski, linux-kernel,
	Masahiro Yamada, Pavel Machek, H . Peter Anvin, kernel-hardening,
	Christoph Lameter, Jiri Slaby, Alok Kataria, linux-doc,
	linux-arch, Herbert Xu, Baoquan He, Jonathan Corbet, Joerg
In-Reply-To: <20180313205945.245105-7-thgarnie@google.com>

On Tue, Mar 13, 2018 at 01:59:24PM -0700, Thomas Garnier wrote:
> @@ -1576,7 +1578,9 @@ first_nmi:
>  	addq	$8, (%rsp)	/* Fix up RSP */
>  	pushfq			/* RFLAGS */
>  	pushq	$__KERNEL_CS	/* CS */
> -	pushq	$1f		/* RIP */
> +	pushq	%rax		/* Support Position Independent Code */
> +	leaq	1f(%rip), %rax	/* RIP */
> +	xchgq	%rax, (%rsp)	/* Restore RAX, put 1f */
>  	iretq			/* continues at repeat_nmi below */
>  	UNWIND_HINT_IRET_REGS
>  1:

Urgh, xchg with a memop has an implicit LOCK prefix.

^ permalink raw reply

* Re: [PATCH v2 21/27] x86/ftrace: Adapt function tracing for PIE support
From: Peter Zijlstra @ 2018-03-14 10:35 UTC (permalink / raw)
  To: Thomas Garnier
  Cc: Kate Stewart, Nicolas Pitre, Michal Hocko, Sergey Senozhatsky,
	Petr Mladek, Len Brown, Boris Ostrovsky, Christopher Li,
	Dave Hansen, x86, Dominik Brodowski, linux-kernel,
	Masahiro Yamada, Pavel Machek, H . Peter Anvin, kernel-hardening,
	Christoph Lameter, Jiri Slaby, Alok Kataria, linux-doc,
	linux-arch, Herbert Xu, Baoquan He, Jonathan Corbet, Joerg
In-Reply-To: <20180313205945.245105-22-thgarnie@google.com>

On Tue, Mar 13, 2018 at 01:59:39PM -0700, Thomas Garnier wrote:
> +	/*
> +	 * If PIE is not enabled or no GOT call was found, default to the
> +	 * original approach to code modification.
> +	 */
> +	if (!IS_ENABLED(CONFIG_X86_PIE)
> +	    || probe_kernel_read(replaced, (void *)ip, sizeof(replaced))
> +	    || memcmp(replaced, got_call_preinsn, sizeof(got_call_preinsn)))
> +		return ftrace_modify_code_direct(ip, old_code, new_code);

The predominant kernel style is to place the operators at the end of the
previous line, not first on the new line.

^ permalink raw reply

* Re: [PATCH v2 06/27] x86/entry/64: Adapt assembly for PIE support
From: Thomas Garnier via Virtualization @ 2018-03-14 17:09 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Kate Stewart, Nicolas Pitre, Michal Hocko, Sergey Senozhatsky,
	Petr Mladek, Len Brown, Peter Zijlstra, Christopher Li,
	Dave Hansen, the arch/x86 maintainers, Dominik Brodowski, LKML,
	Masahiro Yamada, Pavel Machek, H . Peter Anvin, Kernel Hardening,
	Jiri Slaby, Alok Kataria, Linux Doc Mailing List, linux-arch,
	Herbert Xu, Baoquan He
In-Reply-To: <alpine.DEB.2.20.1803141051210.14471@nuc-kabylake>

On Wed, Mar 14, 2018 at 8:55 AM Christopher Lameter <cl@linux.com> wrote:



> On Wed, 14 Mar 2018, Peter Zijlstra wrote:

> > On Tue, Mar 13, 2018 at 01:59:24PM -0700, Thomas Garnier wrote:
> > > @@ -1576,7 +1578,9 @@ first_nmi:
> > >     addq    $8, (%rsp)      /* Fix up RSP */
> > >     pushfq                  /* RFLAGS */
> > >     pushq   $__KERNEL_CS    /* CS */
> > > -   pushq   $1f             /* RIP */
> > > +   pushq   %rax            /* Support Position Independent Code */
> > > +   leaq    1f(%rip), %rax  /* RIP */
> > > +   xchgq   %rax, (%rsp)    /* Restore RAX, put 1f */
> > >     iretq                   /* continues at repeat_nmi below */
> > >     UNWIND_HINT_IRET_REGS
> > >  1:
> >
> > Urgh, xchg with a memop has an implicit LOCK prefix.

> this_cpu_xchg uses no lock cmpxchg as a replacement to reduce latency.

Great, I will update my implementation.

Thanks Peter and Christoph.


>  From linux/arch/x86/include/asm/percpu.h

> /*
>   * xchg is implemented using cmpxchg without a lock prefix. xchg is
>   * expensive due to the implied lock prefix.  The processor cannot
prefetch
>   * cachelines if xchg is used.
>   */
> #define percpu_xchg_op(var, nval)                                       \
> ({                                                                      \
>          typeof(var) pxo_ret__;                                          \
>          typeof(var) pxo_new__ = (nval);                                 \
>          switch (sizeof(var)) {                                          \
>          case 1:                                                         \
>                  asm("\n\tmov "__percpu_arg(1)",%%al"                    \
>                      "\n1:\tcmpxchgb %2, "__percpu_arg(1)                \
>                      "\n\tjnz 1b"                                        \
>                              : "=&a" (pxo_ret__), "+m" (var)             \
>                              : "q" (pxo_new__)                           \
>                              : "memory");                                \
>                  break;                                                  \
>          case 2:                                                         \
>                  asm("\n\tmov "__percpu_arg(1)",%%ax"                    \
>                      "\n1:\tcmpxchgw %2, "__percpu_arg(1)                \
>                      "\n\tjnz 1b"                                        \
>                              : "=&a" (pxo_ret__), "+m" (var)             \
>                              : "r" (pxo_new__)                           \
>                              : "memory");                                \
>                  break;                                                  \
>          case 4:                                                         \
>                  asm("\n\tmov "__percpu_arg(1)",%%eax"                   \
>                      "\n1:\tcmpxchgl %2, "__percpu_arg(1)                \
>                      "\n\tjnz 1b"                                        \
>                              : "=&a" (pxo_ret__), "+m" (var)             \
>                              : "r" (pxo_new__)                           \
>                              : "memory");                                \
>                  break;                                                  \
>          case 8:                                                         \
>                  asm("\n\tmov "__percpu_arg(1)",%%rax"                   \
>                      "\n1:\tcmpxchgq %2, "__percpu_arg(1)                \
>                      "\n\tjnz 1b"                                        \
>                              : "=&a" (pxo_ret__), "+m" (var)             \
>                              : "r" (pxo_new__)                           \
>                              : "memory");                                \
>                  break;                                                  \
>          default: __bad_percpu_size();                                   \
>          }                                                               \
>          pxo_ret__;                                                      \




-- 
Thomas

^ permalink raw reply

* Re: [PATCH] vhost: add vsock compat ioctl
From: Michael S. Tsirkin @ 2018-03-14 19:05 UTC (permalink / raw)
  To: Sonny Rao; +Cc: kvm, netdev, linux-kernel, virtualization, Stefan Hajnoczi
In-Reply-To: <20180314172605.130483-1-sonnyrao@chromium.org>

On Wed, Mar 14, 2018 at 10:26:05AM -0700, Sonny Rao wrote:
> This will allow usage of vsock from 32-bit binaries on a 64-bit
> kernel.
> 
> Signed-off-by: Sonny Rao <sonnyrao@chromium.org>

I think you need to convert the pointer argument though.
Something along the lines of:

#ifdef CONFIG_COMPAT
static long vhost_vsock_dev_compat_ioctl(struct file *f, unsigned int ioctl,
					 unsigned long arg)
{
        return vhost_vsock_dev_ioctl(f, ioctl, (unsigned long)compat_ptr(arg));
} 
#endif  



> ---
>  drivers/vhost/vsock.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
> index 0d14e2ff19f16..d0e65e92110e5 100644
> --- a/drivers/vhost/vsock.c
> +++ b/drivers/vhost/vsock.c
> @@ -705,6 +705,7 @@ static const struct file_operations vhost_vsock_fops = {
>  	.release        = vhost_vsock_dev_release,
>  	.llseek		= noop_llseek,
>  	.unlocked_ioctl = vhost_vsock_dev_ioctl,
> +	.compat_ioctl   = vhost_vsock_dev_ioctl,
>  };
>  
>  static struct miscdevice vhost_vsock_misc = {
> -- 
> 2.13.5

^ permalink raw reply

* Re: [PATCH v2 00/27] x86: PIE support and option to extend KASLR randomization
From: Pavel Machek @ 2018-03-15  8:48 UTC (permalink / raw)
  To: Thomas Garnier
  Cc: Kate Stewart, Nicolas Pitre, Michal Hocko, Sergey Senozhatsky,
	Petr Mladek, Len Brown, Peter Zijlstra, Christopher Li,
	Dave Hansen, x86, Dominik Brodowski, linux-kernel,
	Masahiro Yamada, H . Peter Anvin, kernel-hardening,
	Christoph Lameter, Jiri Slaby, Alok Kataria, linux-doc,
	linux-arch, Herbert Xu, Baoquan He, Jonathan Corbet,
	Boris Ostrovsky, Ra
In-Reply-To: <20180313205945.245105-1-thgarnie@google.com>


[-- Attachment #1.1: Type: text/plain, Size: 735 bytes --]

Hi!

> These patches make the changes necessary to build the kernel as Position
> Independent Executable (PIE) on x86_64. A PIE kernel can be relocated below
> the top 2G of the virtual address space. It allows to optionally extend the
> KASLR randomization range from 1G to 3G.

Would you explain why PIE code is good idea?

You are adding less than 2 bits of randomness. Cost is new config
option, some size and performance impact, and more than 1000 lines of
code...

Is there some grand plan of adding 30 more bits of randomness with
future patch or something?
									Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

[-- Attachment #1.2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

[-- Attachment #2: Type: text/plain, Size: 183 bytes --]

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH v2 06/27] x86/entry/64: Adapt assembly for PIE support
From: Paolo Bonzini @ 2018-03-15 10:18 UTC (permalink / raw)
  To: Christopher Lameter, Peter Zijlstra
  Cc: Kate Stewart, Nicolas Pitre, Michal Hocko, Sergey Senozhatsky,
	Petr Mladek, Len Brown, Boris Ostrovsky, Christopher Li,
	Dave Hansen, x86, Dominik Brodowski, linux-kernel,
	Masahiro Yamada, Pavel Machek, H . Peter Anvin, kernel-hardening,
	Jiri Slaby, Alok Kataria, linux-doc, linux-arch, Herbert Xu,
	Baoquan He, Jonathan Corbet, Joerg Roedel
In-Reply-To: <alpine.DEB.2.20.1803141051210.14471@nuc-kabylake>

On 14/03/2018 16:54, Christopher Lameter wrote:
>>> +	pushq	%rax		/* Support Position Independent Code */
>>> +	leaq	1f(%rip), %rax	/* RIP */
>>> +	xchgq	%rax, (%rsp)	/* Restore RAX, put 1f */
>>>  	iretq			/* continues at repeat_nmi below */
>>>  	UNWIND_HINT_IRET_REGS
>>>  1:
>> Urgh, xchg with a memop has an implicit LOCK prefix.
> this_cpu_xchg uses no lock cmpxchg as a replacement to reduce latency.

That requires using a second register, since %rax is used as the
comparison source.  At this point it's easier to just push %rax twice:

	pushq %rax
	pushq %rax
	leaq 1f(%ip), %rax
	movq %rax, 8(%rsp)
	popq %rax
	iretq

Paolo

^ permalink raw reply

* Re: [PATCH v2] vhost: add vsock compat ioctl
From: Stefan Hajnoczi @ 2018-03-15 13:51 UTC (permalink / raw)
  To: Sonny Rao; +Cc: kvm, Michael S . Tsirkin, netdev, linux-kernel, virtualization
In-Reply-To: <20180314213625.119211-1-sonnyrao@chromium.org>


[-- Attachment #1.1: Type: text/plain, Size: 335 bytes --]

On Wed, Mar 14, 2018 at 02:36:25PM -0700, Sonny Rao wrote:
> This will allow usage of vsock from 32-bit binaries on a 64-bit
> kernel.
> 
> Signed-off-by: Sonny Rao <sonnyrao@chromium.org>
> ---
>  drivers/vhost/vsock.c | 11 +++++++++++
>  1 file changed, 11 insertions(+)

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>

[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 455 bytes --]

[-- Attachment #2: Type: text/plain, Size: 183 bytes --]

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH RFC 2/2] virtio_ring: support packed ring
From: Jason Wang @ 2018-03-16  4:03 UTC (permalink / raw)
  To: Tiwei Bie, mst, virtualization, linux-kernel, netdev; +Cc: wexu
In-Reply-To: <20180223111801.15240-3-tiwei.bie@intel.com>



On 2018年02月23日 19:18, Tiwei Bie wrote:
> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> ---
>   drivers/virtio/virtio_ring.c | 699 +++++++++++++++++++++++++++++++++++++------
>   include/linux/virtio_ring.h  |   8 +-
>   2 files changed, 618 insertions(+), 89 deletions(-)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index eb30f3e09a47..393778a2f809 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -58,14 +58,14 @@
>   
>   struct vring_desc_state {
>   	void *data;			/* Data for callback. */
> -	struct vring_desc *indir_desc;	/* Indirect descriptor, if any. */
> +	void *indir_desc;		/* Indirect descriptor, if any. */
> +	int num;			/* Descriptor list length. */
>   };
>   
>   struct vring_virtqueue {
>   	struct virtqueue vq;
>   
> -	/* Actual memory layout for this queue */
> -	struct vring vring;
> +	bool packed;
>   
>   	/* Can we use weak barriers? */
>   	bool weak_barriers;
> @@ -87,11 +87,28 @@ struct vring_virtqueue {
>   	/* Last used index we've seen. */
>   	u16 last_used_idx;
>   
> -	/* Last written value to avail->flags */
> -	u16 avail_flags_shadow;
> -
> -	/* Last written value to avail->idx in guest byte order */
> -	u16 avail_idx_shadow;
> +	union {
> +		/* Available for split ring */
> +		struct {
> +			/* Actual memory layout for this queue */
> +			struct vring vring;
> +
> +			/* Last written value to avail->flags */
> +			u16 avail_flags_shadow;
> +
> +			/* Last written value to avail->idx in
> +			 * guest byte order */
> +			u16 avail_idx_shadow;
> +		};
> +
> +		/* Available for packed ring */
> +		struct {
> +			/* Actual memory layout for this queue */
> +			struct vring_packed vring_packed;
> +			u8 wrap_counter : 1;
> +			bool chaining;
> +		};
> +	};
>   
>   	/* How to notify other side. FIXME: commonalize hcalls! */
>   	bool (*notify)(struct virtqueue *vq);
> @@ -201,26 +218,37 @@ static dma_addr_t vring_map_single(const struct vring_virtqueue *vq,
>   			      cpu_addr, size, direction);
>   }
>   
> -static void vring_unmap_one(const struct vring_virtqueue *vq,
> -			    struct vring_desc *desc)
> +static void vring_unmap_one(const struct vring_virtqueue *vq, void *_desc)
>   {

Let's split the helpers to packed/split version like other helpers? 
(Consider the caller has already known the type of vq).

> +	u64 addr;
> +	u32 len;
>   	u16 flags;
>   
>   	if (!vring_use_dma_api(vq->vq.vdev))
>   		return;
>   
> -	flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
> +	if (vq->packed) {
> +		struct vring_packed_desc *desc = _desc;
> +
> +		addr = virtio64_to_cpu(vq->vq.vdev, desc->addr);
> +		len = virtio32_to_cpu(vq->vq.vdev, desc->len);
> +		flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
> +	} else {
> +		struct vring_desc *desc = _desc;
> +
> +		addr = virtio64_to_cpu(vq->vq.vdev, desc->addr);
> +		len = virtio32_to_cpu(vq->vq.vdev, desc->len);
> +		flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
> +	}
>   
>   	if (flags & VRING_DESC_F_INDIRECT) {
>   		dma_unmap_single(vring_dma_dev(vq),
> -				 virtio64_to_cpu(vq->vq.vdev, desc->addr),
> -				 virtio32_to_cpu(vq->vq.vdev, desc->len),
> +				 addr, len,
>   				 (flags & VRING_DESC_F_WRITE) ?
>   				 DMA_FROM_DEVICE : DMA_TO_DEVICE);
>   	} else {
>   		dma_unmap_page(vring_dma_dev(vq),
> -			       virtio64_to_cpu(vq->vq.vdev, desc->addr),
> -			       virtio32_to_cpu(vq->vq.vdev, desc->len),
> +			       addr, len,
>   			       (flags & VRING_DESC_F_WRITE) ?
>   			       DMA_FROM_DEVICE : DMA_TO_DEVICE);
>   	}
> @@ -235,8 +263,9 @@ static int vring_mapping_error(const struct vring_virtqueue *vq,
>   	return dma_mapping_error(vring_dma_dev(vq), addr);
>   }
>   
> -static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
> -					 unsigned int total_sg, gfp_t gfp)
> +static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq,
> +					       unsigned int total_sg,
> +					       gfp_t gfp)
>   {
>   	struct vring_desc *desc;
>   	unsigned int i;
> @@ -257,14 +286,32 @@ static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
>   	return desc;
>   }
>   
> -static inline int virtqueue_add(struct virtqueue *_vq,
> -				struct scatterlist *sgs[],
> -				unsigned int total_sg,
> -				unsigned int out_sgs,
> -				unsigned int in_sgs,
> -				void *data,
> -				void *ctx,
> -				gfp_t gfp)
> +static struct vring_packed_desc *alloc_indirect_packed(struct virtqueue *_vq,
> +						       unsigned int total_sg,
> +						       gfp_t gfp)
> +{
> +	struct vring_packed_desc *desc;
> +
> +	/*
> +	 * We require lowmem mappings for the descriptors because
> +	 * otherwise virt_to_phys will give us bogus addresses in the
> +	 * virtqueue.
> +	 */
> +	gfp &= ~__GFP_HIGHMEM;
> +
> +	desc = kmalloc(total_sg * sizeof(struct vring_packed_desc), gfp);
> +
> +	return desc;
> +}
> +
> +static inline int virtqueue_add_split(struct virtqueue *_vq,
> +				      struct scatterlist *sgs[],
> +				      unsigned int total_sg,
> +				      unsigned int out_sgs,
> +				      unsigned int in_sgs,
> +				      void *data,
> +				      void *ctx,
> +				      gfp_t gfp)
>   {
>   	struct vring_virtqueue *vq = to_vvq(_vq);
>   	struct scatterlist *sg;
> @@ -303,7 +350,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
>   	/* If the host supports indirect descriptor tables, and we have multiple
>   	 * buffers, then go indirect. FIXME: tune this threshold */
>   	if (vq->indirect && total_sg > 1 && vq->vq.num_free)
> -		desc = alloc_indirect(_vq, total_sg, gfp);
> +		desc = alloc_indirect_split(_vq, total_sg, gfp);
>   	else {
>   		desc = NULL;
>   		WARN_ON_ONCE(total_sg > vq->vring.num && !vq->indirect);
> @@ -437,6 +484,243 @@ static inline int virtqueue_add(struct virtqueue *_vq,
>   	return -EIO;
>   }
>   
> +static inline int virtqueue_add_packed(struct virtqueue *_vq,
> +				       struct scatterlist *sgs[],
> +				       unsigned int total_sg,
> +				       unsigned int out_sgs,
> +				       unsigned int in_sgs,
> +				       void *data,
> +				       void *ctx,
> +				       gfp_t gfp)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	struct vring_packed_desc *desc;
> +	struct scatterlist *sg;
> +	unsigned int i, n, descs_used, uninitialized_var(prev), err_idx;
> +	__virtio16 uninitialized_var(head_flags), flags;
> +	int head, wrap_counter;
> +	bool indirect;
> +
> +	START_USE(vq);
> +
> +	BUG_ON(data == NULL);
> +	BUG_ON(ctx && vq->indirect);
> +
> +	if (unlikely(vq->broken)) {
> +		END_USE(vq);
> +		return -EIO;
> +	}
> +
> +	if (total_sg > 1 && !vq->chaining && !vq->indirect) {
> +		END_USE(vq);
> +		return -ENOTSUPP;
> +	}
> +
> +#ifdef DEBUG
> +	{
> +		ktime_t now = ktime_get();
> +
> +		/* No kick or get, with .1 second between?  Warn. */
> +		if (vq->last_add_time_valid)
> +			WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time))
> +					    > 100);
> +		vq->last_add_time = now;
> +		vq->last_add_time_valid = true;
> +	}
> +#endif
> +
> +	BUG_ON(total_sg == 0);
> +
> +	head = vq->free_head;
> +	wrap_counter = vq->wrap_counter;
> +
> +	/* If the host supports indirect descriptor tables, and we have multiple
> +	 * buffers, then go indirect. FIXME: tune this threshold */
> +	if (vq->indirect && total_sg > 1 && vq->vq.num_free)
> +		desc = alloc_indirect_packed(_vq, total_sg, gfp);
> +	else {
> +		desc = NULL;
> +		WARN_ON_ONCE(total_sg > vq->vring.num && !vq->indirect);
> +	}
> +
> +	if (desc) {
> +		/* Use a single buffer which doesn't continue */
> +		indirect = true;
> +		/* Set up rest to use this indirect table. */
> +		i = 0;
> +		descs_used = 1;
> +	} else {
> +		indirect = false;
> +		desc = vq->vring_packed.desc;
> +		i = head;
> +		descs_used = total_sg;
> +
> +		if (total_sg > 1 && !vq->chaining) {
> +			END_USE(vq);
> +			return -ENOTSUPP;
> +		}
> +	}
> +
> +	if (vq->vq.num_free < descs_used) {
> +		pr_debug("Can't add buf len %i - avail = %i\n",
> +			 descs_used, vq->vq.num_free);
> +		/* FIXME: for historical reasons, we force a notify here if
> +		 * there are outgoing parts to the buffer.  Presumably the
> +		 * host should service the ring ASAP. */
> +		if (out_sgs)
> +			vq->notify(&vq->vq);
> +		if (indirect)
> +			kfree(desc);
> +		END_USE(vq);
> +		return -ENOSPC;
> +	}
> +
> +	for (n = 0; n < out_sgs; n++) {
> +		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> +			dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_TO_DEVICE);
> +			if (vring_mapping_error(vq, addr))
> +				goto unmap_release;
> +
> +			flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT |
> +					VRING_DESC_F_AVAIL(vq->wrap_counter) |
> +					VRING_DESC_F_USED(!vq->wrap_counter));
> +			if (!indirect && i == head)
> +				head_flags = flags;
> +			else
> +				desc[i].flags = flags;
> +
> +			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
> +			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
> +			desc[i].id = cpu_to_virtio32(_vq->vdev, head);

If it's a part of chain, we only need to do this for last buffer I think.

> +			prev = i;
> +			i++;

It looks to me prev is always i - 1?

> +			if (!indirect && i >= vq->vring_packed.num) {
> +				i = 0;
> +				vq->wrap_counter ^= 1;
> +			}
> +		}
> +	}
> +	for (; n < (out_sgs + in_sgs); n++) {
> +		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> +			dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_FROM_DEVICE);
> +			if (vring_mapping_error(vq, addr))
> +				goto unmap_release;
> +
> +			flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT |
> +					VRING_DESC_F_WRITE |
> +					VRING_DESC_F_AVAIL(vq->wrap_counter) |
> +					VRING_DESC_F_USED(!vq->wrap_counter));
> +			if (!indirect && i == head)
> +				head_flags = flags;
> +			else
> +				desc[i].flags = flags;
> +
> +			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
> +			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
> +			desc[i].id = cpu_to_virtio32(_vq->vdev, head);
> +			prev = i;
> +			i++;
> +			if (!indirect && i >= vq->vring_packed.num) {
> +				i = 0;
> +				vq->wrap_counter ^= 1;
> +			}
> +		}
> +	}
> +	/* Last one doesn't continue. */
> +	if (!indirect && (head + 1) % vq->vring_packed.num == i)
> +		head_flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);

I can't get the why we need this here.

> +	else
> +		desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
> +
> +	if (indirect) {
> +		/* FIXME: to be implemented */
> +
> +		/* Now that the indirect table is filled in, map it. */
> +		dma_addr_t addr = vring_map_single(
> +			vq, desc, total_sg * sizeof(struct vring_packed_desc),
> +			DMA_TO_DEVICE);
> +		if (vring_mapping_error(vq, addr))
> +			goto unmap_release;
> +
> +		head_flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT |
> +					     VRING_DESC_F_AVAIL(wrap_counter) |
> +					     VRING_DESC_F_USED(!wrap_counter));
> +		vq->vring_packed.desc[head].addr = cpu_to_virtio64(_vq->vdev, addr);
> +		vq->vring_packed.desc[head].len = cpu_to_virtio32(_vq->vdev,
> +				total_sg * sizeof(struct vring_packed_desc));
> +		vq->vring_packed.desc[head].id = cpu_to_virtio32(_vq->vdev, head);
> +	}
> +
> +	/* We're using some buffers from the free list. */
> +	vq->vq.num_free -= descs_used;
> +
> +	/* Update free pointer */
> +	if (indirect) {
> +		n = head + 1;
> +		if (n >= vq->vring_packed.num) {
> +			n = 0;
> +			vq->wrap_counter ^= 1;
> +		}
> +		vq->free_head = n;

detach_buf_packed() does not even touch free_head here, so need to 
explain its meaning for packed ring.

> +	} else
> +		vq->free_head = i;

ID is only valid in the last descriptor in the list, so head + 1 should 
be ok too?

> +
> +	/* Store token and indirect buffer state. */
> +	vq->desc_state[head].num = descs_used;
> +	vq->desc_state[head].data = data;
> +	if (indirect)
> +		vq->desc_state[head].indir_desc = desc;
> +	else
> +		vq->desc_state[head].indir_desc = ctx;
> +
> +	virtio_wmb(vq->weak_barriers);

Let's add a comment to explain the barrier here.

> +	vq->vring_packed.desc[head].flags = head_flags;
> +	vq->num_added++;
> +
> +	pr_debug("Added buffer head %i to %p\n", head, vq);
> +	END_USE(vq);
> +
> +	return 0;
> +
> +unmap_release:
> +	err_idx = i;
> +	i = head;
> +
> +	for (n = 0; n < total_sg; n++) {
> +		if (i == err_idx)
> +			break;
> +		vring_unmap_one(vq, &desc[i]);
> +		i++;
> +		if (!indirect && i >= vq->vring_packed.num)
> +			i = 0;
> +	}
> +
> +	vq->wrap_counter = wrap_counter;
> +
> +	if (indirect)
> +		kfree(desc);
> +
> +	END_USE(vq);
> +	return -EIO;
> +}
> +
> +static inline int virtqueue_add(struct virtqueue *_vq,
> +				struct scatterlist *sgs[],
> +				unsigned int total_sg,
> +				unsigned int out_sgs,
> +				unsigned int in_sgs,
> +				void *data,
> +				void *ctx,
> +				gfp_t gfp)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +
> +	return vq->packed ? virtqueue_add_packed(_vq, sgs, total_sg, out_sgs,
> +						 in_sgs, data, ctx, gfp) :
> +			    virtqueue_add_split(_vq, sgs, total_sg, out_sgs,
> +						in_sgs, data, ctx, gfp);
> +}
> +
>   /**
>    * virtqueue_add_sgs - expose buffers to other end
>    * @vq: the struct virtqueue we're talking about.
> @@ -561,6 +845,12 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq)
>   	 * event. */
>   	virtio_mb(vq->weak_barriers);
>   
> +	if (vq->packed) {
> +		/* FIXME: to be implemented */
> +		needs_kick = true;
> +		goto out;
> +	}
> +
>   	old = vq->avail_idx_shadow - vq->num_added;
>   	new = vq->avail_idx_shadow;
>   	vq->num_added = 0;
> @@ -579,6 +869,8 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq)
>   	} else {
>   		needs_kick = !(vq->vring.used->flags & cpu_to_virtio16(_vq->vdev, VRING_USED_F_NO_NOTIFY));
>   	}
> +
> +out:
>   	END_USE(vq);
>   	return needs_kick;
>   }
> @@ -628,8 +920,8 @@ bool virtqueue_kick(struct virtqueue *vq)
>   }
>   EXPORT_SYMBOL_GPL(virtqueue_kick);
>   
> -static void detach_buf(struct vring_virtqueue *vq, unsigned int head,
> -		       void **ctx)
> +static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
> +			     void **ctx)
>   {
>   	unsigned int i, j;
>   	__virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
> @@ -677,29 +969,81 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head,
>   	}
>   }
>   
> -static inline bool more_used(const struct vring_virtqueue *vq)
> +static void detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
> +			      void **ctx)
> +{
> +	struct vring_packed_desc *desc;
> +	unsigned int i, j;
> +
> +	/* Clear data ptr. */
> +	vq->desc_state[head].data = NULL;
> +
> +	i = head;
> +
> +	for (j = 0; j < vq->desc_state[head].num; j++) {
> +		desc = &vq->vring_packed.desc[i];
> +		vring_unmap_one(vq, desc);
> +		i++;
> +		if (i >= vq->vring_packed.num)
> +			i = 0;
> +	}
> +
> +	vq->vq.num_free += vq->desc_state[head].num;

It looks to me vq->free_head grows always, how can we make sure it does 
not exceeds vq.num here?

> +
> +	if (vq->indirect) {
> +		u32 len;
> +
> +		desc = vq->desc_state[head].indir_desc;
> +		/* Free the indirect table, if any, now that it's unmapped. */
> +		if (!desc)
> +			return;
> +
> +		len = virtio32_to_cpu(vq->vq.vdev,
> +				      vq->vring_packed.desc[head].len);
> +
> +		BUG_ON(!(vq->vring_packed.desc[head].flags &
> +			 cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));
> +		BUG_ON(len == 0 || len % sizeof(struct vring_packed_desc));
> +
> +		for (j = 0; j < len / sizeof(struct vring_packed_desc); j++)
> +			vring_unmap_one(vq, &desc[j]);
> +
> +		kfree(desc);
> +		vq->desc_state[head].indir_desc = NULL;
> +	} else if (ctx) {
> +		*ctx = vq->desc_state[head].indir_desc;
> +	}
> +}
> +
> +static inline bool more_used_split(const struct vring_virtqueue *vq)
>   {
>   	return vq->last_used_idx != virtio16_to_cpu(vq->vq.vdev, vq->vring.used->idx);
>   }
>   
> -/**
> - * virtqueue_get_buf - get the next used buffer
> - * @vq: the struct virtqueue we're talking about.
> - * @len: the length written into the buffer
> - *
> - * If the device wrote data into the buffer, @len will be set to the
> - * amount written.  This means you don't need to clear the buffer
> - * beforehand to ensure there's no data leakage in the case of short
> - * writes.
> - *
> - * Caller must ensure we don't call this with other virtqueue
> - * operations at the same time (except where noted).
> - *
> - * Returns NULL if there are no used buffers, or the "data" token
> - * handed to virtqueue_add_*().
> - */
> -void *virtqueue_get_buf_ctx(struct virtqueue *_vq, unsigned int *len,
> -			    void **ctx)
> +static inline bool more_used_packed(const struct vring_virtqueue *vq)
> +{
> +	u16 last_used, flags;
> +	bool avail, used;
> +
> +	if (vq->vq.num_free == vq->vring.num)
> +		return false;
> +
> +	last_used = vq->last_used_idx;
> +	flags = virtio16_to_cpu(vq->vq.vdev,
> +				vq->vring_packed.desc[last_used].flags);
> +	avail = flags & VRING_DESC_F_AVAIL(1);
> +	used = flags & VRING_DESC_F_USED(1);
> +
> +	return avail == used;
> +}
> +
> +static inline bool more_used(const struct vring_virtqueue *vq)
> +{
> +	return vq->packed ? more_used_packed(vq) : more_used_split(vq);
> +}
> +
> +void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq, unsigned int *len,
> +				  void **ctx)
>   {
>   	struct vring_virtqueue *vq = to_vvq(_vq);
>   	void *ret;
> @@ -735,9 +1079,9 @@ void *virtqueue_get_buf_ctx(struct virtqueue *_vq, unsigned int *len,
>   		return NULL;
>   	}
>   
> -	/* detach_buf clears data, so grab it now. */
> +	/* detach_buf_split clears data, so grab it now. */
>   	ret = vq->desc_state[i].data;
> -	detach_buf(vq, i, ctx);
> +	detach_buf_split(vq, i, ctx);
>   	vq->last_used_idx++;
>   	/* If we expect an interrupt for the next entry, tell host
>   	 * by writing event index and flush out the write before
> @@ -754,6 +1098,87 @@ void *virtqueue_get_buf_ctx(struct virtqueue *_vq, unsigned int *len,
>   	END_USE(vq);
>   	return ret;
>   }
> +
> +void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq, unsigned int *len,
> +				   void **ctx)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +	void *ret;
> +	unsigned int i;
> +	u16 last_used;
> +
> +	START_USE(vq);
> +
> +	if (unlikely(vq->broken)) {
> +		END_USE(vq);
> +		return NULL;
> +	}
> +
> +	if (!more_used(vq)) {
> +		pr_debug("No more buffers in queue\n");
> +		END_USE(vq);
> +		return NULL;
> +	}
> +
> +	/* Only get used array entries after they have been exposed by host. */
> +	virtio_rmb(vq->weak_barriers);
> +
> +	last_used = vq->last_used_idx;
> +
> +	i = virtio32_to_cpu(_vq->vdev, vq->vring_packed.desc[last_used].id);
> +	*len = virtio32_to_cpu(_vq->vdev, vq->vring_packed.desc[last_used].len);
> +
> +	if (unlikely(i >= vq->vring_packed.num)) {
> +		BAD_RING(vq, "id %u out of range\n", i);
> +		return NULL;
> +	}
> +	if (unlikely(!vq->desc_state[i].data)) {
> +		BAD_RING(vq, "id %u is not a head!\n", i);
> +		return NULL;
> +	}
> +
> +	/* detach_buf_packed clears data, so grab it now. */
> +	ret = vq->desc_state[i].data;
> +	detach_buf_packed(vq, i, ctx);
> +
> +	vq->last_used_idx += vq->desc_state[i].num;
> +	if (vq->last_used_idx >= vq->vring_packed.num)
> +		vq->last_used_idx %= vq->vring_packed.num;

'-=' should be sufficient here?

> +
> +	// FIXME: implement the desc event support
> +
> +#ifdef DEBUG
> +	vq->last_add_time_valid = false;
> +#endif
> +
> +	END_USE(vq);
> +	return ret;
> +}
> +
> +/**
> + * virtqueue_get_buf - get the next used buffer
> + * @vq: the struct virtqueue we're talking about.
> + * @len: the length written into the buffer
> + *
> + * If the device wrote data into the buffer, @len will be set to the
> + * amount written.  This means you don't need to clear the buffer
> + * beforehand to ensure there's no data leakage in the case of short
> + * writes.
> + *
> + * Caller must ensure we don't call this with other virtqueue
> + * operations at the same time (except where noted).
> + *
> + * Returns NULL if there are no used buffers, or the "data" token
> + * handed to virtqueue_add_*().
> + */
> +void *virtqueue_get_buf_ctx(struct virtqueue *_vq, unsigned int *len,
> +			    void **ctx)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +
> +	return vq->packed ? virtqueue_get_buf_ctx_packed(_vq, len, ctx) :
> +			    virtqueue_get_buf_ctx_split(_vq, len, ctx);
> +}
>   EXPORT_SYMBOL_GPL(virtqueue_get_buf_ctx);
>   
>   void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
> @@ -761,6 +1186,24 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
>   	return virtqueue_get_buf_ctx(_vq, len, NULL);
>   }
>   EXPORT_SYMBOL_GPL(virtqueue_get_buf);
> +
> +static void virtqueue_disable_cb_split(struct virtqueue *_vq)
> +{
> +	struct vring_virtqueue *vq = to_vvq(_vq);
> +
> +	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
> +		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
> +		if (!vq->event)
> +			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev,
> +							vq->avail_flags_shadow);
> +	}
> +}
> +
> +static void virtqueue_disable_cb_packed(struct virtqueue *_vq)
> +{
> +	// FIXME: to be implemented
> +}
> +
>   /**
>    * virtqueue_disable_cb - disable callbacks
>    * @vq: the struct virtqueue we're talking about.
> @@ -774,12 +1217,10 @@ void virtqueue_disable_cb(struct virtqueue *_vq)
>   {
>   	struct vring_virtqueue *vq = to_vvq(_vq);
>   
> -	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
> -		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
> -		if (!vq->event)
> -			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
> -	}
> -
> +	if (vq->packed)
> +		virtqueue_disable_cb_packed(_vq);
> +	else
> +		virtqueue_disable_cb_split(_vq);
>   }
>   EXPORT_SYMBOL_GPL(virtqueue_disable_cb);
>   
> @@ -802,6 +1243,12 @@ unsigned virtqueue_enable_cb_prepare(struct virtqueue *_vq)
>   
>   	START_USE(vq);
>   
> +	if (vq->packed) {
> +		// FIXME: to be implemented
> +		last_used_idx = vq->last_used_idx;
> +		goto out;
> +	}
> +
>   	/* We optimistically turn back on interrupts, then check if there was
>   	 * more to do. */
>   	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
> @@ -813,6 +1260,7 @@ unsigned virtqueue_enable_cb_prepare(struct virtqueue *_vq)
>   			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
>   	}
>   	vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, last_used_idx = vq->last_used_idx);
> +out:
>   	END_USE(vq);
>   	return last_used_idx;
>   }
> @@ -832,6 +1280,12 @@ bool virtqueue_poll(struct virtqueue *_vq, unsigned last_used_idx)
>   	struct vring_virtqueue *vq = to_vvq(_vq);
>   
>   	virtio_mb(vq->weak_barriers);
> +	if (vq->packed) {
> +		u16 flags = virtio16_to_cpu(vq->vq.vdev,
> +				vq->vring_packed.desc[last_used_idx].flags);
> +		return !(flags & VRING_DESC_F_AVAIL(1)) ==
> +		       !(flags & VRING_DESC_F_USED(1));
> +	}
>   	return (u16)last_used_idx != virtio16_to_cpu(_vq->vdev, vq->vring.used->idx);
>   }
>   EXPORT_SYMBOL_GPL(virtqueue_poll);
> @@ -874,6 +1328,11 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq)
>   
>   	START_USE(vq);
>   
> +	if (vq->packed) {
> +		// FIXME: to be implemented
> +		goto out;
> +	}
> +
>   	/* We optimistically turn back on interrupts, then check if there was
>   	 * more to do. */
>   	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
> @@ -896,6 +1355,7 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq)
>   		return false;
>   	}
>   
> +out:
>   	END_USE(vq);
>   	return true;
>   }
> @@ -922,14 +1382,20 @@ void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
>   			continue;
>   		/* detach_buf clears data, so grab it now. */
>   		buf = vq->desc_state[i].data;
> -		detach_buf(vq, i, NULL);
> -		vq->avail_idx_shadow--;
> -		vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
> +		if (vq->packed)
> +			detach_buf_packed(vq, i, NULL);
> +		else {
> +			detach_buf_split(vq, i, NULL);
> +			vq->avail_idx_shadow--;
> +			vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev,
> +							vq->avail_idx_shadow);
> +		}
>   		END_USE(vq);
>   		return buf;
>   	}
>   	/* That should have freed everything. */
> -	BUG_ON(vq->vq.num_free != vq->vring.num);
> +	BUG_ON(vq->vq.num_free != (vq->packed ? vq->vring_packed.num :
> +						vq->vring.num));
>   
>   	END_USE(vq);
>   	return NULL;
> @@ -957,7 +1423,8 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
>   EXPORT_SYMBOL_GPL(vring_interrupt);
>   
>   struct virtqueue *__vring_new_virtqueue(unsigned int index,
> -					struct vring vring,
> +					union vring_union vring,
> +					bool packed,
>   					struct virtio_device *vdev,
>   					bool weak_barriers,
>   					bool context,
> @@ -965,19 +1432,20 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
>   					void (*callback)(struct virtqueue *),
>   					const char *name)
>   {
> -	unsigned int i;
> +	unsigned int num, i;
>   	struct vring_virtqueue *vq;
>   
> -	vq = kmalloc(sizeof(*vq) + vring.num * sizeof(struct vring_desc_state),
> +	num = packed ? vring.vring_packed.num : vring.vring_split.num;
> +
> +	vq = kmalloc(sizeof(*vq) + num * sizeof(struct vring_desc_state),
>   		     GFP_KERNEL);
>   	if (!vq)
>   		return NULL;
>   
> -	vq->vring = vring;
>   	vq->vq.callback = callback;
>   	vq->vq.vdev = vdev;
>   	vq->vq.name = name;
> -	vq->vq.num_free = vring.num;
> +	vq->vq.num_free = num;
>   	vq->vq.index = index;
>   	vq->we_own_ring = false;
>   	vq->queue_dma_addr = 0;
> @@ -986,9 +1454,8 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
>   	vq->weak_barriers = weak_barriers;
>   	vq->broken = false;
>   	vq->last_used_idx = 0;
> -	vq->avail_flags_shadow = 0;
> -	vq->avail_idx_shadow = 0;
>   	vq->num_added = 0;
> +	vq->packed = packed;
>   	list_add_tail(&vq->vq.list, &vdev->vqs);
>   #ifdef DEBUG
>   	vq->in_use = false;
> @@ -999,18 +1466,41 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
>   		!context;
>   	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
>   
> +	if (vq->packed) {
> +		vq->vring_packed = vring.vring_packed;
> +		vq->free_head = 0;
> +		vq->wrap_counter = 1;
> +
> +#if 0
> +		vq->chaining = virtio_has_feature(vdev,
> +						  VIRTIO_RING_F_LIST_DESC);
> +#else
> +		vq->chaining = true;

Looks like in V10 there's no F_LIST_DESC.

> +#endif
> +	} else {
> +		vq->vring = vring.vring_split;
> +		vq->avail_flags_shadow = 0;
> +		vq->avail_idx_shadow = 0;
> +
> +		/* Put everything in free lists. */
> +		vq->free_head = 0;
> +		for (i = 0; i < num-1; i++)
> +			vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
> +	}
> +
>   	/* No callback?  Tell other side not to bother us. */
>   	if (!callback) {
> -		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
> -		if (!vq->event)
> -			vq->vring.avail->flags = cpu_to_virtio16(vdev, vq->avail_flags_shadow);
> +		if (packed) {
> +			// FIXME: to be implemented
> +		} else {
> +			vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
> +			if (!vq->event)
> +				vq->vring.avail->flags = cpu_to_virtio16(vdev,
> +						vq->avail_flags_shadow);
> +		}
>   	}
>   
> -	/* Put everything in free lists. */
> -	vq->free_head = 0;
> -	for (i = 0; i < vring.num-1; i++)
> -		vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
> -	memset(vq->desc_state, 0, vring.num * sizeof(struct vring_desc_state));
> +	memset(vq->desc_state, 0, num * sizeof(struct vring_desc_state));
>   
>   	return &vq->vq;
>   }
> @@ -1058,6 +1548,14 @@ static void vring_free_queue(struct virtio_device *vdev, size_t size,
>   	}
>   }
>   
> +static inline int
> +__vring_size(unsigned int num, unsigned long align, bool packed)
> +{
> +	if (packed)
> +		return vring_packed_size(num, align);
> +	return vring_size(num, align);
> +}
> +
>   struct virtqueue *vring_create_virtqueue(
>   	unsigned int index,
>   	unsigned int num,
> @@ -1074,7 +1572,8 @@ struct virtqueue *vring_create_virtqueue(
>   	void *queue = NULL;
>   	dma_addr_t dma_addr;
>   	size_t queue_size_in_bytes;
> -	struct vring vring;
> +	union vring_union vring;
> +	bool packed;
>   
>   	/* We assume num is a power of 2. */
>   	if (num & (num - 1)) {
> @@ -1082,9 +1581,13 @@ struct virtqueue *vring_create_virtqueue(
>   		return NULL;
>   	}
>   
> +	packed = virtio_has_feature(vdev, VIRTIO_F_RING_PACKED);
> +
>   	/* TODO: allocate each queue chunk individually */
> -	for (; num && vring_size(num, vring_align) > PAGE_SIZE; num /= 2) {
> -		queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
> +	for (; num && __vring_size(num, vring_align, packed) > PAGE_SIZE;
> +			num /= 2) {
> +		queue = vring_alloc_queue(vdev, __vring_size(num, vring_align,
> +							     packed),
>   					  &dma_addr,
>   					  GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO);
>   		if (queue)
> @@ -1096,17 +1599,21 @@ struct virtqueue *vring_create_virtqueue(
>   
>   	if (!queue) {
>   		/* Try to get a single page. You are my only hope! */
> -		queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
> +		queue = vring_alloc_queue(vdev, __vring_size(num, vring_align,
> +							     packed),
>   					  &dma_addr, GFP_KERNEL|__GFP_ZERO);
>   	}
>   	if (!queue)
>   		return NULL;
>   
> -	queue_size_in_bytes = vring_size(num, vring_align);
> -	vring_init(&vring, num, queue, vring_align);
> +	queue_size_in_bytes = __vring_size(num, vring_align, packed);
> +	if (packed)
> +		vring_packed_init(&vring.vring_packed, num, queue, vring_align);
> +	else
> +		vring_init(&vring.vring_split, num, queue, vring_align);

Let's rename vring_init to vring_init_split() like other helpers?

>   
> -	vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
> -				   notify, callback, name);
> +	vq = __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
> +				   context, notify, callback, name);
>   	if (!vq) {
>   		vring_free_queue(vdev, queue_size_in_bytes, queue,
>   				 dma_addr);
> @@ -1132,10 +1639,17 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
>   				      void (*callback)(struct virtqueue *vq),
>   				      const char *name)
>   {
> -	struct vring vring;
> -	vring_init(&vring, num, pages, vring_align);
> -	return __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
> -				     notify, callback, name);
> +	union vring_union vring;
> +	bool packed;
> +
> +	packed = virtio_has_feature(vdev, VIRTIO_F_RING_PACKED);
> +	if (packed)
> +		vring_packed_init(&vring.vring_packed, num, pages, vring_align);
> +	else
> +		vring_init(&vring.vring_split, num, pages, vring_align);
> +
> +	return __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
> +				     context, notify, callback, name);
>   }
>   EXPORT_SYMBOL_GPL(vring_new_virtqueue);
>   
> @@ -1145,7 +1659,9 @@ void vring_del_virtqueue(struct virtqueue *_vq)
>   
>   	if (vq->we_own_ring) {
>   		vring_free_queue(vq->vq.vdev, vq->queue_size_in_bytes,
> -				 vq->vring.desc, vq->queue_dma_addr);
> +				 vq->packed ? (void *)vq->vring_packed.desc :
> +					      (void *)vq->vring.desc,
> +				 vq->queue_dma_addr);
>   	}
>   	list_del(&_vq->list);
>   	kfree(vq);
> @@ -1159,14 +1675,18 @@ void vring_transport_features(struct virtio_device *vdev)
>   
>   	for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) {
>   		switch (i) {
> +#if 0 // FIXME: to be implemented
>   		case VIRTIO_RING_F_INDIRECT_DESC:
>   			break;
> +#endif
>   		case VIRTIO_RING_F_EVENT_IDX:
>   			break;
>   		case VIRTIO_F_VERSION_1:
>   			break;
>   		case VIRTIO_F_IOMMU_PLATFORM:
>   			break;
> +		case VIRTIO_F_RING_PACKED:
> +			break;
>   		default:
>   			/* We don't understand this bit. */
>   			__virtio_clear_bit(vdev, i);
> @@ -1187,7 +1707,7 @@ unsigned int virtqueue_get_vring_size(struct virtqueue *_vq)
>   
>   	struct vring_virtqueue *vq = to_vvq(_vq);
>   
> -	return vq->vring.num;
> +	return vq->packed ? vq->vring_packed.num : vq->vring.num;
>   }
>   EXPORT_SYMBOL_GPL(virtqueue_get_vring_size);
>   
> @@ -1224,6 +1744,7 @@ dma_addr_t virtqueue_get_desc_addr(struct virtqueue *_vq)
>   }
>   EXPORT_SYMBOL_GPL(virtqueue_get_desc_addr);
>   
> +/* Only available for split ring */

Interesting, I think we need this for correctly configure pci. e.g in 
setup_vq()?

>   dma_addr_t virtqueue_get_avail_addr(struct virtqueue *_vq)
>   {
>   	struct vring_virtqueue *vq = to_vvq(_vq);
> @@ -1235,6 +1756,7 @@ dma_addr_t virtqueue_get_avail_addr(struct virtqueue *_vq)
>   }
>   EXPORT_SYMBOL_GPL(virtqueue_get_avail_addr);
>   
> +/* Only available for split ring */
>   dma_addr_t virtqueue_get_used_addr(struct virtqueue *_vq)

Maybe it's better to rename this to get_device_addr().

Thanks

>   {
>   	struct vring_virtqueue *vq = to_vvq(_vq);
> @@ -1246,6 +1768,7 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *_vq)
>   }
>   EXPORT_SYMBOL_GPL(virtqueue_get_used_addr);
>   
> +/* Only available for split ring */
>   const struct vring *virtqueue_get_vring(struct virtqueue *vq)
>   {
>   	return &to_vvq(vq)->vring;
> diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
> index bbf32524ab27..a0075894ad16 100644
> --- a/include/linux/virtio_ring.h
> +++ b/include/linux/virtio_ring.h
> @@ -60,6 +60,11 @@ static inline void virtio_store_mb(bool weak_barriers,
>   struct virtio_device;
>   struct virtqueue;
>   
> +union vring_union {
> +	struct vring vring_split;
> +	struct vring_packed vring_packed;
> +};
> +
>   /*
>    * Creates a virtqueue and allocates the descriptor ring.  If
>    * may_reduce_num is set, then this may allocate a smaller ring than
> @@ -79,7 +84,8 @@ struct virtqueue *vring_create_virtqueue(unsigned int index,
>   
>   /* Creates a virtqueue with a custom layout. */
>   struct virtqueue *__vring_new_virtqueue(unsigned int index,
> -					struct vring vring,
> +					union vring_union vring,
> +					bool packed,
>   					struct virtio_device *vdev,
>   					bool weak_barriers,
>   					bool ctx,

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH RFC 2/2] virtio_ring: support packed ring
From: Tiwei Bie @ 2018-03-16  6:10 UTC (permalink / raw)
  To: Jason Wang; +Cc: mst, netdev, linux-kernel, virtualization, wexu
In-Reply-To: <8f73267a-e20e-de64-d8e0-3fd608dbf368@redhat.com>

On Fri, Mar 16, 2018 at 12:03:25PM +0800, Jason Wang wrote:
> On 2018年02月23日 19:18, Tiwei Bie wrote:
> > Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> > ---
> >   drivers/virtio/virtio_ring.c | 699 +++++++++++++++++++++++++++++++++++++------
> >   include/linux/virtio_ring.h  |   8 +-
> >   2 files changed, 618 insertions(+), 89 deletions(-)
> > 
> > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > index eb30f3e09a47..393778a2f809 100644
> > --- a/drivers/virtio/virtio_ring.c
> > +++ b/drivers/virtio/virtio_ring.c
> > @@ -58,14 +58,14 @@
> >   struct vring_desc_state {
> >   	void *data;			/* Data for callback. */
> > -	struct vring_desc *indir_desc;	/* Indirect descriptor, if any. */
> > +	void *indir_desc;		/* Indirect descriptor, if any. */
> > +	int num;			/* Descriptor list length. */
> >   };
> >   struct vring_virtqueue {
> >   	struct virtqueue vq;
> > -	/* Actual memory layout for this queue */
> > -	struct vring vring;
> > +	bool packed;
> >   	/* Can we use weak barriers? */
> >   	bool weak_barriers;
> > @@ -87,11 +87,28 @@ struct vring_virtqueue {
> >   	/* Last used index we've seen. */
> >   	u16 last_used_idx;
> > -	/* Last written value to avail->flags */
> > -	u16 avail_flags_shadow;
> > -
> > -	/* Last written value to avail->idx in guest byte order */
> > -	u16 avail_idx_shadow;
> > +	union {
> > +		/* Available for split ring */
> > +		struct {
> > +			/* Actual memory layout for this queue */
> > +			struct vring vring;
> > +
> > +			/* Last written value to avail->flags */
> > +			u16 avail_flags_shadow;
> > +
> > +			/* Last written value to avail->idx in
> > +			 * guest byte order */
> > +			u16 avail_idx_shadow;
> > +		};
> > +
> > +		/* Available for packed ring */
> > +		struct {
> > +			/* Actual memory layout for this queue */
> > +			struct vring_packed vring_packed;
> > +			u8 wrap_counter : 1;
> > +			bool chaining;
> > +		};
> > +	};
> >   	/* How to notify other side. FIXME: commonalize hcalls! */
> >   	bool (*notify)(struct virtqueue *vq);
> > @@ -201,26 +218,37 @@ static dma_addr_t vring_map_single(const struct vring_virtqueue *vq,
> >   			      cpu_addr, size, direction);
> >   }
> > -static void vring_unmap_one(const struct vring_virtqueue *vq,
> > -			    struct vring_desc *desc)
> > +static void vring_unmap_one(const struct vring_virtqueue *vq, void *_desc)
> >   {
> 
> Let's split the helpers to packed/split version like other helpers?
> (Consider the caller has already known the type of vq).

Okay.

> 
> > +	u64 addr;
> > +	u32 len;
> >   	u16 flags;
> >   	if (!vring_use_dma_api(vq->vq.vdev))
> >   		return;
> > -	flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
> > +	if (vq->packed) {
> > +		struct vring_packed_desc *desc = _desc;
> > +
> > +		addr = virtio64_to_cpu(vq->vq.vdev, desc->addr);
> > +		len = virtio32_to_cpu(vq->vq.vdev, desc->len);
> > +		flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
> > +	} else {
> > +		struct vring_desc *desc = _desc;
> > +
> > +		addr = virtio64_to_cpu(vq->vq.vdev, desc->addr);
> > +		len = virtio32_to_cpu(vq->vq.vdev, desc->len);
> > +		flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
> > +	}
> >   	if (flags & VRING_DESC_F_INDIRECT) {
> >   		dma_unmap_single(vring_dma_dev(vq),
> > -				 virtio64_to_cpu(vq->vq.vdev, desc->addr),
> > -				 virtio32_to_cpu(vq->vq.vdev, desc->len),
> > +				 addr, len,
> >   				 (flags & VRING_DESC_F_WRITE) ?
> >   				 DMA_FROM_DEVICE : DMA_TO_DEVICE);
> >   	} else {
> >   		dma_unmap_page(vring_dma_dev(vq),
> > -			       virtio64_to_cpu(vq->vq.vdev, desc->addr),
> > -			       virtio32_to_cpu(vq->vq.vdev, desc->len),
> > +			       addr, len,
> >   			       (flags & VRING_DESC_F_WRITE) ?
> >   			       DMA_FROM_DEVICE : DMA_TO_DEVICE);
> >   	}
> > @@ -235,8 +263,9 @@ static int vring_mapping_error(const struct vring_virtqueue *vq,
> >   	return dma_mapping_error(vring_dma_dev(vq), addr);
> >   }
> > -static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
> > -					 unsigned int total_sg, gfp_t gfp)
> > +static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq,
> > +					       unsigned int total_sg,
> > +					       gfp_t gfp)
> >   {
> >   	struct vring_desc *desc;
> >   	unsigned int i;
> > @@ -257,14 +286,32 @@ static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
> >   	return desc;
> >   }
> > -static inline int virtqueue_add(struct virtqueue *_vq,
> > -				struct scatterlist *sgs[],
> > -				unsigned int total_sg,
> > -				unsigned int out_sgs,
> > -				unsigned int in_sgs,
> > -				void *data,
> > -				void *ctx,
> > -				gfp_t gfp)
> > +static struct vring_packed_desc *alloc_indirect_packed(struct virtqueue *_vq,
> > +						       unsigned int total_sg,
> > +						       gfp_t gfp)
> > +{
> > +	struct vring_packed_desc *desc;
> > +
> > +	/*
> > +	 * We require lowmem mappings for the descriptors because
> > +	 * otherwise virt_to_phys will give us bogus addresses in the
> > +	 * virtqueue.
> > +	 */
> > +	gfp &= ~__GFP_HIGHMEM;
> > +
> > +	desc = kmalloc(total_sg * sizeof(struct vring_packed_desc), gfp);
> > +
> > +	return desc;
> > +}
> > +
> > +static inline int virtqueue_add_split(struct virtqueue *_vq,
> > +				      struct scatterlist *sgs[],
> > +				      unsigned int total_sg,
> > +				      unsigned int out_sgs,
> > +				      unsigned int in_sgs,
> > +				      void *data,
> > +				      void *ctx,
> > +				      gfp_t gfp)
> >   {
> >   	struct vring_virtqueue *vq = to_vvq(_vq);
> >   	struct scatterlist *sg;
> > @@ -303,7 +350,7 @@ static inline int virtqueue_add(struct virtqueue *_vq,
> >   	/* If the host supports indirect descriptor tables, and we have multiple
> >   	 * buffers, then go indirect. FIXME: tune this threshold */
> >   	if (vq->indirect && total_sg > 1 && vq->vq.num_free)
> > -		desc = alloc_indirect(_vq, total_sg, gfp);
> > +		desc = alloc_indirect_split(_vq, total_sg, gfp);
> >   	else {
> >   		desc = NULL;
> >   		WARN_ON_ONCE(total_sg > vq->vring.num && !vq->indirect);
> > @@ -437,6 +484,243 @@ static inline int virtqueue_add(struct virtqueue *_vq,
> >   	return -EIO;
> >   }
> > +static inline int virtqueue_add_packed(struct virtqueue *_vq,
> > +				       struct scatterlist *sgs[],
> > +				       unsigned int total_sg,
> > +				       unsigned int out_sgs,
> > +				       unsigned int in_sgs,
> > +				       void *data,
> > +				       void *ctx,
> > +				       gfp_t gfp)
> > +{
> > +	struct vring_virtqueue *vq = to_vvq(_vq);
> > +	struct vring_packed_desc *desc;
> > +	struct scatterlist *sg;
> > +	unsigned int i, n, descs_used, uninitialized_var(prev), err_idx;
> > +	__virtio16 uninitialized_var(head_flags), flags;
> > +	int head, wrap_counter;
> > +	bool indirect;
> > +
> > +	START_USE(vq);
> > +
> > +	BUG_ON(data == NULL);
> > +	BUG_ON(ctx && vq->indirect);
> > +
> > +	if (unlikely(vq->broken)) {
> > +		END_USE(vq);
> > +		return -EIO;
> > +	}
> > +
> > +	if (total_sg > 1 && !vq->chaining && !vq->indirect) {
> > +		END_USE(vq);
> > +		return -ENOTSUPP;
> > +	}
> > +
> > +#ifdef DEBUG
> > +	{
> > +		ktime_t now = ktime_get();
> > +
> > +		/* No kick or get, with .1 second between?  Warn. */
> > +		if (vq->last_add_time_valid)
> > +			WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time))
> > +					    > 100);
> > +		vq->last_add_time = now;
> > +		vq->last_add_time_valid = true;
> > +	}
> > +#endif
> > +
> > +	BUG_ON(total_sg == 0);
> > +
> > +	head = vq->free_head;
> > +	wrap_counter = vq->wrap_counter;
> > +
> > +	/* If the host supports indirect descriptor tables, and we have multiple
> > +	 * buffers, then go indirect. FIXME: tune this threshold */
> > +	if (vq->indirect && total_sg > 1 && vq->vq.num_free)
> > +		desc = alloc_indirect_packed(_vq, total_sg, gfp);
> > +	else {
> > +		desc = NULL;
> > +		WARN_ON_ONCE(total_sg > vq->vring.num && !vq->indirect);
> > +	}
> > +
> > +	if (desc) {
> > +		/* Use a single buffer which doesn't continue */
> > +		indirect = true;
> > +		/* Set up rest to use this indirect table. */
> > +		i = 0;
> > +		descs_used = 1;
> > +	} else {
> > +		indirect = false;
> > +		desc = vq->vring_packed.desc;
> > +		i = head;
> > +		descs_used = total_sg;
> > +
> > +		if (total_sg > 1 && !vq->chaining) {
> > +			END_USE(vq);
> > +			return -ENOTSUPP;
> > +		}
> > +	}
> > +
> > +	if (vq->vq.num_free < descs_used) {
> > +		pr_debug("Can't add buf len %i - avail = %i\n",
> > +			 descs_used, vq->vq.num_free);
> > +		/* FIXME: for historical reasons, we force a notify here if
> > +		 * there are outgoing parts to the buffer.  Presumably the
> > +		 * host should service the ring ASAP. */
> > +		if (out_sgs)
> > +			vq->notify(&vq->vq);
> > +		if (indirect)
> > +			kfree(desc);
> > +		END_USE(vq);
> > +		return -ENOSPC;
> > +	}
> > +
> > +	for (n = 0; n < out_sgs; n++) {
> > +		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> > +			dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_TO_DEVICE);
> > +			if (vring_mapping_error(vq, addr))
> > +				goto unmap_release;
> > +
> > +			flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT |
> > +					VRING_DESC_F_AVAIL(vq->wrap_counter) |
> > +					VRING_DESC_F_USED(!vq->wrap_counter));
> > +			if (!indirect && i == head)
> > +				head_flags = flags;
> > +			else
> > +				desc[i].flags = flags;
> > +
> > +			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
> > +			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
> > +			desc[i].id = cpu_to_virtio32(_vq->vdev, head);
> 
> If it's a part of chain, we only need to do this for last buffer I think.

I'm not sure I've got your point about the "last buffer".
But, yes, id just needs to be set for the last desc.

> 
> > +			prev = i;
> > +			i++;
> 
> It looks to me prev is always i - 1?

No. prev will be (vq->vring_packed.num - 1) when i becomes 0.

> 
> > +			if (!indirect && i >= vq->vring_packed.num) {
> > +				i = 0;
> > +				vq->wrap_counter ^= 1;
> > +			}
> > +		}
> > +	}
> > +	for (; n < (out_sgs + in_sgs); n++) {
> > +		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> > +			dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_FROM_DEVICE);
> > +			if (vring_mapping_error(vq, addr))
> > +				goto unmap_release;
> > +
> > +			flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT |
> > +					VRING_DESC_F_WRITE |
> > +					VRING_DESC_F_AVAIL(vq->wrap_counter) |
> > +					VRING_DESC_F_USED(!vq->wrap_counter));
> > +			if (!indirect && i == head)
> > +				head_flags = flags;
> > +			else
> > +				desc[i].flags = flags;
> > +
> > +			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
> > +			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
> > +			desc[i].id = cpu_to_virtio32(_vq->vdev, head);
> > +			prev = i;
> > +			i++;
> > +			if (!indirect && i >= vq->vring_packed.num) {
> > +				i = 0;
> > +				vq->wrap_counter ^= 1;
> > +			}
> > +		}
> > +	}
> > +	/* Last one doesn't continue. */
> > +	if (!indirect && (head + 1) % vq->vring_packed.num == i)
> > +		head_flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
> 
> I can't get the why we need this here.

If only one desc is used, we will need to clear the
VRING_DESC_F_NEXT flag from the head_flags.

> 
> > +	else
> > +		desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
> > +
> > +	if (indirect) {
> > +		/* FIXME: to be implemented */
> > +
> > +		/* Now that the indirect table is filled in, map it. */
> > +		dma_addr_t addr = vring_map_single(
> > +			vq, desc, total_sg * sizeof(struct vring_packed_desc),
> > +			DMA_TO_DEVICE);
> > +		if (vring_mapping_error(vq, addr))
> > +			goto unmap_release;
> > +
> > +		head_flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT |
> > +					     VRING_DESC_F_AVAIL(wrap_counter) |
> > +					     VRING_DESC_F_USED(!wrap_counter));
> > +		vq->vring_packed.desc[head].addr = cpu_to_virtio64(_vq->vdev, addr);
> > +		vq->vring_packed.desc[head].len = cpu_to_virtio32(_vq->vdev,
> > +				total_sg * sizeof(struct vring_packed_desc));
> > +		vq->vring_packed.desc[head].id = cpu_to_virtio32(_vq->vdev, head);
> > +	}
> > +
> > +	/* We're using some buffers from the free list. */
> > +	vq->vq.num_free -= descs_used;
> > +
> > +	/* Update free pointer */
> > +	if (indirect) {
> > +		n = head + 1;
> > +		if (n >= vq->vring_packed.num) {
> > +			n = 0;
> > +			vq->wrap_counter ^= 1;
> > +		}
> > +		vq->free_head = n;
> 
> detach_buf_packed() does not even touch free_head here, so need to explain
> its meaning for packed ring.

Above code is for indirect support which isn't really
implemented in this patch yet.

For your question, free_head stores the index of the
next avail desc. I'll add a comment for it or move it
to union and give it a better name in next version.

> 
> > +	} else
> > +		vq->free_head = i;
> 
> ID is only valid in the last descriptor in the list, so head + 1 should be
> ok too?

I don't really get your point. The vq->free_head stores
the index of the next avail desc.

> 
> > +
> > +	/* Store token and indirect buffer state. */
> > +	vq->desc_state[head].num = descs_used;
> > +	vq->desc_state[head].data = data;
> > +	if (indirect)
> > +		vq->desc_state[head].indir_desc = desc;
> > +	else
> > +		vq->desc_state[head].indir_desc = ctx;
> > +
> > +	virtio_wmb(vq->weak_barriers);
> 
> Let's add a comment to explain the barrier here.

Okay.

> 
> > +	vq->vring_packed.desc[head].flags = head_flags;
> > +	vq->num_added++;
> > +
> > +	pr_debug("Added buffer head %i to %p\n", head, vq);
> > +	END_USE(vq);
> > +
> > +	return 0;
> > +
> > +unmap_release:
> > +	err_idx = i;
> > +	i = head;
> > +
> > +	for (n = 0; n < total_sg; n++) {
> > +		if (i == err_idx)
> > +			break;
> > +		vring_unmap_one(vq, &desc[i]);
> > +		i++;
> > +		if (!indirect && i >= vq->vring_packed.num)
> > +			i = 0;
> > +	}
> > +
> > +	vq->wrap_counter = wrap_counter;
> > +
> > +	if (indirect)
> > +		kfree(desc);
> > +
> > +	END_USE(vq);
> > +	return -EIO;
> > +}
> > +
> > +static inline int virtqueue_add(struct virtqueue *_vq,
> > +				struct scatterlist *sgs[],
> > +				unsigned int total_sg,
> > +				unsigned int out_sgs,
> > +				unsigned int in_sgs,
> > +				void *data,
> > +				void *ctx,
> > +				gfp_t gfp)
> > +{
> > +	struct vring_virtqueue *vq = to_vvq(_vq);
> > +
> > +	return vq->packed ? virtqueue_add_packed(_vq, sgs, total_sg, out_sgs,
> > +						 in_sgs, data, ctx, gfp) :
> > +			    virtqueue_add_split(_vq, sgs, total_sg, out_sgs,
> > +						in_sgs, data, ctx, gfp);
> > +}
> > +
> >   /**
> >    * virtqueue_add_sgs - expose buffers to other end
> >    * @vq: the struct virtqueue we're talking about.
> > @@ -561,6 +845,12 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq)
> >   	 * event. */
> >   	virtio_mb(vq->weak_barriers);
> > +	if (vq->packed) {
> > +		/* FIXME: to be implemented */
> > +		needs_kick = true;
> > +		goto out;
> > +	}
> > +
> >   	old = vq->avail_idx_shadow - vq->num_added;
> >   	new = vq->avail_idx_shadow;
> >   	vq->num_added = 0;
> > @@ -579,6 +869,8 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq)
> >   	} else {
> >   		needs_kick = !(vq->vring.used->flags & cpu_to_virtio16(_vq->vdev, VRING_USED_F_NO_NOTIFY));
> >   	}
> > +
> > +out:
> >   	END_USE(vq);
> >   	return needs_kick;
> >   }
> > @@ -628,8 +920,8 @@ bool virtqueue_kick(struct virtqueue *vq)
> >   }
> >   EXPORT_SYMBOL_GPL(virtqueue_kick);
> > -static void detach_buf(struct vring_virtqueue *vq, unsigned int head,
> > -		       void **ctx)
> > +static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
> > +			     void **ctx)
> >   {
> >   	unsigned int i, j;
> >   	__virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
> > @@ -677,29 +969,81 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head,
> >   	}
> >   }
> > -static inline bool more_used(const struct vring_virtqueue *vq)
> > +static void detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
> > +			      void **ctx)
> > +{
> > +	struct vring_packed_desc *desc;
> > +	unsigned int i, j;
> > +
> > +	/* Clear data ptr. */
> > +	vq->desc_state[head].data = NULL;
> > +
> > +	i = head;
> > +
> > +	for (j = 0; j < vq->desc_state[head].num; j++) {
> > +		desc = &vq->vring_packed.desc[i];
> > +		vring_unmap_one(vq, desc);
> > +		i++;
> > +		if (i >= vq->vring_packed.num)
> > +			i = 0;
> > +	}
> > +
> > +	vq->vq.num_free += vq->desc_state[head].num;
> 
> It looks to me vq->free_head grows always, how can we make sure it does not
> exceeds vq.num here?

The vq->free_head stores the index of the next avail
desc. You can find it wraps together with vq->wrap_counter
in virtqueue_add_packed().

> 
> > +
> > +	if (vq->indirect) {
> > +		u32 len;
> > +
> > +		desc = vq->desc_state[head].indir_desc;
> > +		/* Free the indirect table, if any, now that it's unmapped. */
> > +		if (!desc)
> > +			return;
> > +
> > +		len = virtio32_to_cpu(vq->vq.vdev,
> > +				      vq->vring_packed.desc[head].len);
> > +
> > +		BUG_ON(!(vq->vring_packed.desc[head].flags &
> > +			 cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));
> > +		BUG_ON(len == 0 || len % sizeof(struct vring_packed_desc));
> > +
> > +		for (j = 0; j < len / sizeof(struct vring_packed_desc); j++)
> > +			vring_unmap_one(vq, &desc[j]);
> > +
> > +		kfree(desc);
> > +		vq->desc_state[head].indir_desc = NULL;
> > +	} else if (ctx) {
> > +		*ctx = vq->desc_state[head].indir_desc;
> > +	}
> > +}
> > +
> > +static inline bool more_used_split(const struct vring_virtqueue *vq)
> >   {
> >   	return vq->last_used_idx != virtio16_to_cpu(vq->vq.vdev, vq->vring.used->idx);
> >   }
> > -/**
> > - * virtqueue_get_buf - get the next used buffer
> > - * @vq: the struct virtqueue we're talking about.
> > - * @len: the length written into the buffer
> > - *
> > - * If the device wrote data into the buffer, @len will be set to the
> > - * amount written.  This means you don't need to clear the buffer
> > - * beforehand to ensure there's no data leakage in the case of short
> > - * writes.
> > - *
> > - * Caller must ensure we don't call this with other virtqueue
> > - * operations at the same time (except where noted).
> > - *
> > - * Returns NULL if there are no used buffers, or the "data" token
> > - * handed to virtqueue_add_*().
> > - */
> > -void *virtqueue_get_buf_ctx(struct virtqueue *_vq, unsigned int *len,
> > -			    void **ctx)
> > +static inline bool more_used_packed(const struct vring_virtqueue *vq)
> > +{
> > +	u16 last_used, flags;
> > +	bool avail, used;
> > +
> > +	if (vq->vq.num_free == vq->vring.num)
> > +		return false;
> > +
> > +	last_used = vq->last_used_idx;
> > +	flags = virtio16_to_cpu(vq->vq.vdev,
> > +				vq->vring_packed.desc[last_used].flags);
> > +	avail = flags & VRING_DESC_F_AVAIL(1);
> > +	used = flags & VRING_DESC_F_USED(1);
> > +
> > +	return avail == used;
> > +}
> > +
> > +static inline bool more_used(const struct vring_virtqueue *vq)
> > +{
> > +	return vq->packed ? more_used_packed(vq) : more_used_split(vq);
> > +}
> > +
> > +void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq, unsigned int *len,
> > +				  void **ctx)
> >   {
> >   	struct vring_virtqueue *vq = to_vvq(_vq);
> >   	void *ret;
> > @@ -735,9 +1079,9 @@ void *virtqueue_get_buf_ctx(struct virtqueue *_vq, unsigned int *len,
> >   		return NULL;
> >   	}
> > -	/* detach_buf clears data, so grab it now. */
> > +	/* detach_buf_split clears data, so grab it now. */
> >   	ret = vq->desc_state[i].data;
> > -	detach_buf(vq, i, ctx);
> > +	detach_buf_split(vq, i, ctx);
> >   	vq->last_used_idx++;
> >   	/* If we expect an interrupt for the next entry, tell host
> >   	 * by writing event index and flush out the write before
> > @@ -754,6 +1098,87 @@ void *virtqueue_get_buf_ctx(struct virtqueue *_vq, unsigned int *len,
> >   	END_USE(vq);
> >   	return ret;
> >   }
> > +
> > +void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq, unsigned int *len,
> > +				   void **ctx)
> > +{
> > +	struct vring_virtqueue *vq = to_vvq(_vq);
> > +	void *ret;
> > +	unsigned int i;
> > +	u16 last_used;
> > +
> > +	START_USE(vq);
> > +
> > +	if (unlikely(vq->broken)) {
> > +		END_USE(vq);
> > +		return NULL;
> > +	}
> > +
> > +	if (!more_used(vq)) {
> > +		pr_debug("No more buffers in queue\n");
> > +		END_USE(vq);
> > +		return NULL;
> > +	}
> > +
> > +	/* Only get used array entries after they have been exposed by host. */
> > +	virtio_rmb(vq->weak_barriers);
> > +
> > +	last_used = vq->last_used_idx;
> > +
> > +	i = virtio32_to_cpu(_vq->vdev, vq->vring_packed.desc[last_used].id);
> > +	*len = virtio32_to_cpu(_vq->vdev, vq->vring_packed.desc[last_used].len);
> > +
> > +	if (unlikely(i >= vq->vring_packed.num)) {
> > +		BAD_RING(vq, "id %u out of range\n", i);
> > +		return NULL;
> > +	}
> > +	if (unlikely(!vq->desc_state[i].data)) {
> > +		BAD_RING(vq, "id %u is not a head!\n", i);
> > +		return NULL;
> > +	}
> > +
> > +	/* detach_buf_packed clears data, so grab it now. */
> > +	ret = vq->desc_state[i].data;
> > +	detach_buf_packed(vq, i, ctx);
> > +
> > +	vq->last_used_idx += vq->desc_state[i].num;
> > +	if (vq->last_used_idx >= vq->vring_packed.num)
> > +		vq->last_used_idx %= vq->vring_packed.num;
> 
> '-=' should be sufficient here?

Good suggestion. I think so.

> 
> > +
> > +	// FIXME: implement the desc event support
> > +
> > +#ifdef DEBUG
> > +	vq->last_add_time_valid = false;
> > +#endif
> > +
> > +	END_USE(vq);
> > +	return ret;
> > +}
> > +
> > +/**
> > + * virtqueue_get_buf - get the next used buffer
> > + * @vq: the struct virtqueue we're talking about.
> > + * @len: the length written into the buffer
> > + *
> > + * If the device wrote data into the buffer, @len will be set to the
> > + * amount written.  This means you don't need to clear the buffer
> > + * beforehand to ensure there's no data leakage in the case of short
> > + * writes.
> > + *
> > + * Caller must ensure we don't call this with other virtqueue
> > + * operations at the same time (except where noted).
> > + *
> > + * Returns NULL if there are no used buffers, or the "data" token
> > + * handed to virtqueue_add_*().
> > + */
> > +void *virtqueue_get_buf_ctx(struct virtqueue *_vq, unsigned int *len,
> > +			    void **ctx)
> > +{
> > +	struct vring_virtqueue *vq = to_vvq(_vq);
> > +
> > +	return vq->packed ? virtqueue_get_buf_ctx_packed(_vq, len, ctx) :
> > +			    virtqueue_get_buf_ctx_split(_vq, len, ctx);
> > +}
> >   EXPORT_SYMBOL_GPL(virtqueue_get_buf_ctx);
> >   void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
> > @@ -761,6 +1186,24 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
> >   	return virtqueue_get_buf_ctx(_vq, len, NULL);
> >   }
> >   EXPORT_SYMBOL_GPL(virtqueue_get_buf);
> > +
> > +static void virtqueue_disable_cb_split(struct virtqueue *_vq)
> > +{
> > +	struct vring_virtqueue *vq = to_vvq(_vq);
> > +
> > +	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
> > +		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
> > +		if (!vq->event)
> > +			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev,
> > +							vq->avail_flags_shadow);
> > +	}
> > +}
> > +
> > +static void virtqueue_disable_cb_packed(struct virtqueue *_vq)
> > +{
> > +	// FIXME: to be implemented
> > +}
> > +
> >   /**
> >    * virtqueue_disable_cb - disable callbacks
> >    * @vq: the struct virtqueue we're talking about.
> > @@ -774,12 +1217,10 @@ void virtqueue_disable_cb(struct virtqueue *_vq)
> >   {
> >   	struct vring_virtqueue *vq = to_vvq(_vq);
> > -	if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
> > -		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
> > -		if (!vq->event)
> > -			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
> > -	}
> > -
> > +	if (vq->packed)
> > +		virtqueue_disable_cb_packed(_vq);
> > +	else
> > +		virtqueue_disable_cb_split(_vq);
> >   }
> >   EXPORT_SYMBOL_GPL(virtqueue_disable_cb);
> > @@ -802,6 +1243,12 @@ unsigned virtqueue_enable_cb_prepare(struct virtqueue *_vq)
> >   	START_USE(vq);
> > +	if (vq->packed) {
> > +		// FIXME: to be implemented
> > +		last_used_idx = vq->last_used_idx;
> > +		goto out;
> > +	}
> > +
> >   	/* We optimistically turn back on interrupts, then check if there was
> >   	 * more to do. */
> >   	/* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
> > @@ -813,6 +1260,7 @@ unsigned virtqueue_enable_cb_prepare(struct virtqueue *_vq)
> >   			vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
> >   	}
> >   	vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, last_used_idx = vq->last_used_idx);
> > +out:
> >   	END_USE(vq);
> >   	return last_used_idx;
> >   }
> > @@ -832,6 +1280,12 @@ bool virtqueue_poll(struct virtqueue *_vq, unsigned last_used_idx)
> >   	struct vring_virtqueue *vq = to_vvq(_vq);
> >   	virtio_mb(vq->weak_barriers);
> > +	if (vq->packed) {
> > +		u16 flags = virtio16_to_cpu(vq->vq.vdev,
> > +				vq->vring_packed.desc[last_used_idx].flags);
> > +		return !(flags & VRING_DESC_F_AVAIL(1)) ==
> > +		       !(flags & VRING_DESC_F_USED(1));
> > +	}
> >   	return (u16)last_used_idx != virtio16_to_cpu(_vq->vdev, vq->vring.used->idx);
> >   }
> >   EXPORT_SYMBOL_GPL(virtqueue_poll);
> > @@ -874,6 +1328,11 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq)
> >   	START_USE(vq);
> > +	if (vq->packed) {
> > +		// FIXME: to be implemented
> > +		goto out;
> > +	}
> > +
> >   	/* We optimistically turn back on interrupts, then check if there was
> >   	 * more to do. */
> >   	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
> > @@ -896,6 +1355,7 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq)
> >   		return false;
> >   	}
> > +out:
> >   	END_USE(vq);
> >   	return true;
> >   }
> > @@ -922,14 +1382,20 @@ void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
> >   			continue;
> >   		/* detach_buf clears data, so grab it now. */
> >   		buf = vq->desc_state[i].data;
> > -		detach_buf(vq, i, NULL);
> > -		vq->avail_idx_shadow--;
> > -		vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
> > +		if (vq->packed)
> > +			detach_buf_packed(vq, i, NULL);
> > +		else {
> > +			detach_buf_split(vq, i, NULL);
> > +			vq->avail_idx_shadow--;
> > +			vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev,
> > +							vq->avail_idx_shadow);
> > +		}
> >   		END_USE(vq);
> >   		return buf;
> >   	}
> >   	/* That should have freed everything. */
> > -	BUG_ON(vq->vq.num_free != vq->vring.num);
> > +	BUG_ON(vq->vq.num_free != (vq->packed ? vq->vring_packed.num :
> > +						vq->vring.num));
> >   	END_USE(vq);
> >   	return NULL;
> > @@ -957,7 +1423,8 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
> >   EXPORT_SYMBOL_GPL(vring_interrupt);
> >   struct virtqueue *__vring_new_virtqueue(unsigned int index,
> > -					struct vring vring,
> > +					union vring_union vring,
> > +					bool packed,
> >   					struct virtio_device *vdev,
> >   					bool weak_barriers,
> >   					bool context,
> > @@ -965,19 +1432,20 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
> >   					void (*callback)(struct virtqueue *),
> >   					const char *name)
> >   {
> > -	unsigned int i;
> > +	unsigned int num, i;
> >   	struct vring_virtqueue *vq;
> > -	vq = kmalloc(sizeof(*vq) + vring.num * sizeof(struct vring_desc_state),
> > +	num = packed ? vring.vring_packed.num : vring.vring_split.num;
> > +
> > +	vq = kmalloc(sizeof(*vq) + num * sizeof(struct vring_desc_state),
> >   		     GFP_KERNEL);
> >   	if (!vq)
> >   		return NULL;
> > -	vq->vring = vring;
> >   	vq->vq.callback = callback;
> >   	vq->vq.vdev = vdev;
> >   	vq->vq.name = name;
> > -	vq->vq.num_free = vring.num;
> > +	vq->vq.num_free = num;
> >   	vq->vq.index = index;
> >   	vq->we_own_ring = false;
> >   	vq->queue_dma_addr = 0;
> > @@ -986,9 +1454,8 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
> >   	vq->weak_barriers = weak_barriers;
> >   	vq->broken = false;
> >   	vq->last_used_idx = 0;
> > -	vq->avail_flags_shadow = 0;
> > -	vq->avail_idx_shadow = 0;
> >   	vq->num_added = 0;
> > +	vq->packed = packed;
> >   	list_add_tail(&vq->vq.list, &vdev->vqs);
> >   #ifdef DEBUG
> >   	vq->in_use = false;
> > @@ -999,18 +1466,41 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
> >   		!context;
> >   	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
> > +	if (vq->packed) {
> > +		vq->vring_packed = vring.vring_packed;
> > +		vq->free_head = 0;
> > +		vq->wrap_counter = 1;
> > +
> > +#if 0
> > +		vq->chaining = virtio_has_feature(vdev,
> > +						  VIRTIO_RING_F_LIST_DESC);
> > +#else
> > +		vq->chaining = true;
> 
> Looks like in V10 there's no F_LIST_DESC.

Yes. I kept this in this patch just because the
desc chaining is optional in the old spec draft
when sending out this patch set. I'll remove it
in next version.

> 
> > +#endif
> > +	} else {
> > +		vq->vring = vring.vring_split;
> > +		vq->avail_flags_shadow = 0;
> > +		vq->avail_idx_shadow = 0;
> > +
> > +		/* Put everything in free lists. */
> > +		vq->free_head = 0;
> > +		for (i = 0; i < num-1; i++)
> > +			vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
> > +	}
> > +
> >   	/* No callback?  Tell other side not to bother us. */
> >   	if (!callback) {
> > -		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
> > -		if (!vq->event)
> > -			vq->vring.avail->flags = cpu_to_virtio16(vdev, vq->avail_flags_shadow);
> > +		if (packed) {
> > +			// FIXME: to be implemented
> > +		} else {
> > +			vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
> > +			if (!vq->event)
> > +				vq->vring.avail->flags = cpu_to_virtio16(vdev,
> > +						vq->avail_flags_shadow);
> > +		}
> >   	}
> > -	/* Put everything in free lists. */
> > -	vq->free_head = 0;
> > -	for (i = 0; i < vring.num-1; i++)
> > -		vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
> > -	memset(vq->desc_state, 0, vring.num * sizeof(struct vring_desc_state));
> > +	memset(vq->desc_state, 0, num * sizeof(struct vring_desc_state));
> >   	return &vq->vq;
> >   }
> > @@ -1058,6 +1548,14 @@ static void vring_free_queue(struct virtio_device *vdev, size_t size,
> >   	}
> >   }
> > +static inline int
> > +__vring_size(unsigned int num, unsigned long align, bool packed)
> > +{
> > +	if (packed)
> > +		return vring_packed_size(num, align);
> > +	return vring_size(num, align);
> > +}
> > +
> >   struct virtqueue *vring_create_virtqueue(
> >   	unsigned int index,
> >   	unsigned int num,
> > @@ -1074,7 +1572,8 @@ struct virtqueue *vring_create_virtqueue(
> >   	void *queue = NULL;
> >   	dma_addr_t dma_addr;
> >   	size_t queue_size_in_bytes;
> > -	struct vring vring;
> > +	union vring_union vring;
> > +	bool packed;
> >   	/* We assume num is a power of 2. */
> >   	if (num & (num - 1)) {
> > @@ -1082,9 +1581,13 @@ struct virtqueue *vring_create_virtqueue(
> >   		return NULL;
> >   	}
> > +	packed = virtio_has_feature(vdev, VIRTIO_F_RING_PACKED);
> > +
> >   	/* TODO: allocate each queue chunk individually */
> > -	for (; num && vring_size(num, vring_align) > PAGE_SIZE; num /= 2) {
> > -		queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
> > +	for (; num && __vring_size(num, vring_align, packed) > PAGE_SIZE;
> > +			num /= 2) {
> > +		queue = vring_alloc_queue(vdev, __vring_size(num, vring_align,
> > +							     packed),
> >   					  &dma_addr,
> >   					  GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO);
> >   		if (queue)
> > @@ -1096,17 +1599,21 @@ struct virtqueue *vring_create_virtqueue(
> >   	if (!queue) {
> >   		/* Try to get a single page. You are my only hope! */
> > -		queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
> > +		queue = vring_alloc_queue(vdev, __vring_size(num, vring_align,
> > +							     packed),
> >   					  &dma_addr, GFP_KERNEL|__GFP_ZERO);
> >   	}
> >   	if (!queue)
> >   		return NULL;
> > -	queue_size_in_bytes = vring_size(num, vring_align);
> > -	vring_init(&vring, num, queue, vring_align);
> > +	queue_size_in_bytes = __vring_size(num, vring_align, packed);
> > +	if (packed)
> > +		vring_packed_init(&vring.vring_packed, num, queue, vring_align);
> > +	else
> > +		vring_init(&vring.vring_split, num, queue, vring_align);
> 
> Let's rename vring_init to vring_init_split() like other helpers?

The vring_init() is a public API in include/uapi/linux/virtio_ring.h.
I don't think we can rename it.

> 
> > -	vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
> > -				   notify, callback, name);
> > +	vq = __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
> > +				   context, notify, callback, name);
> >   	if (!vq) {
> >   		vring_free_queue(vdev, queue_size_in_bytes, queue,
> >   				 dma_addr);
> > @@ -1132,10 +1639,17 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
> >   				      void (*callback)(struct virtqueue *vq),
> >   				      const char *name)
> >   {
> > -	struct vring vring;
> > -	vring_init(&vring, num, pages, vring_align);
> > -	return __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
> > -				     notify, callback, name);
> > +	union vring_union vring;
> > +	bool packed;
> > +
> > +	packed = virtio_has_feature(vdev, VIRTIO_F_RING_PACKED);
> > +	if (packed)
> > +		vring_packed_init(&vring.vring_packed, num, pages, vring_align);
> > +	else
> > +		vring_init(&vring.vring_split, num, pages, vring_align);
> > +
> > +	return __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
> > +				     context, notify, callback, name);
> >   }
> >   EXPORT_SYMBOL_GPL(vring_new_virtqueue);
> > @@ -1145,7 +1659,9 @@ void vring_del_virtqueue(struct virtqueue *_vq)
> >   	if (vq->we_own_ring) {
> >   		vring_free_queue(vq->vq.vdev, vq->queue_size_in_bytes,
> > -				 vq->vring.desc, vq->queue_dma_addr);
> > +				 vq->packed ? (void *)vq->vring_packed.desc :
> > +					      (void *)vq->vring.desc,
> > +				 vq->queue_dma_addr);
> >   	}
> >   	list_del(&_vq->list);
> >   	kfree(vq);
> > @@ -1159,14 +1675,18 @@ void vring_transport_features(struct virtio_device *vdev)
> >   	for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) {
> >   		switch (i) {
> > +#if 0 // FIXME: to be implemented
> >   		case VIRTIO_RING_F_INDIRECT_DESC:
> >   			break;
> > +#endif
> >   		case VIRTIO_RING_F_EVENT_IDX:
> >   			break;
> >   		case VIRTIO_F_VERSION_1:
> >   			break;
> >   		case VIRTIO_F_IOMMU_PLATFORM:
> >   			break;
> > +		case VIRTIO_F_RING_PACKED:
> > +			break;
> >   		default:
> >   			/* We don't understand this bit. */
> >   			__virtio_clear_bit(vdev, i);
> > @@ -1187,7 +1707,7 @@ unsigned int virtqueue_get_vring_size(struct virtqueue *_vq)
> >   	struct vring_virtqueue *vq = to_vvq(_vq);
> > -	return vq->vring.num;
> > +	return vq->packed ? vq->vring_packed.num : vq->vring.num;
> >   }
> >   EXPORT_SYMBOL_GPL(virtqueue_get_vring_size);
> > @@ -1224,6 +1744,7 @@ dma_addr_t virtqueue_get_desc_addr(struct virtqueue *_vq)
> >   }
> >   EXPORT_SYMBOL_GPL(virtqueue_get_desc_addr);
> > +/* Only available for split ring */
> 
> Interesting, I think we need this for correctly configure pci. e.g in
> setup_vq()?

Yes. The setup_vq() should be updated. But it requires
QEMU change, so I just kept it as is in this RFC patch.

> 
> >   dma_addr_t virtqueue_get_avail_addr(struct virtqueue *_vq)
> >   {
> >   	struct vring_virtqueue *vq = to_vvq(_vq);
> > @@ -1235,6 +1756,7 @@ dma_addr_t virtqueue_get_avail_addr(struct virtqueue *_vq)
> >   }
> >   EXPORT_SYMBOL_GPL(virtqueue_get_avail_addr);
> > +/* Only available for split ring */
> >   dma_addr_t virtqueue_get_used_addr(struct virtqueue *_vq)
> 
> Maybe it's better to rename this to get_device_addr().

It's a kernel API which has been exported by EXPORT_SYMBOL_GPL(),
I'm not sure whether it's a good idea to rename it.

Best regards,
Tiwei Bie

> 
> Thanks
> 
> >   {
> >   	struct vring_virtqueue *vq = to_vvq(_vq);
> > @@ -1246,6 +1768,7 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *_vq)
> >   }
> >   EXPORT_SYMBOL_GPL(virtqueue_get_used_addr);
> > +/* Only available for split ring */
> >   const struct vring *virtqueue_get_vring(struct virtqueue *vq)
> >   {
> >   	return &to_vvq(vq)->vring;
> > diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
> > index bbf32524ab27..a0075894ad16 100644
> > --- a/include/linux/virtio_ring.h
> > +++ b/include/linux/virtio_ring.h
> > @@ -60,6 +60,11 @@ static inline void virtio_store_mb(bool weak_barriers,
> >   struct virtio_device;
> >   struct virtqueue;
> > +union vring_union {
> > +	struct vring vring_split;
> > +	struct vring_packed vring_packed;
> > +};
> > +
> >   /*
> >    * Creates a virtqueue and allocates the descriptor ring.  If
> >    * may_reduce_num is set, then this may allocate a smaller ring than
> > @@ -79,7 +84,8 @@ struct virtqueue *vring_create_virtqueue(unsigned int index,
> >   /* Creates a virtqueue with a custom layout. */
> >   struct virtqueue *__vring_new_virtqueue(unsigned int index,
> > -					struct vring vring,
> > +					union vring_union vring,
> > +					bool packed,
> >   					struct virtio_device *vdev,
> >   					bool weak_barriers,
> >   					bool ctx,
> 
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH RFC 2/2] virtio_ring: support packed ring
From: Jason Wang @ 2018-03-16  6:44 UTC (permalink / raw)
  To: Tiwei Bie; +Cc: mst, netdev, linux-kernel, virtualization, wexu
In-Reply-To: <20180316061047.o2xdyuqhak3mzjyw@debian>



On 2018年03月16日 14:10, Tiwei Bie wrote:
> On Fri, Mar 16, 2018 at 12:03:25PM +0800, Jason Wang wrote:
>> On 2018年02月23日 19:18, Tiwei Bie wrote:
>>> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
>>> ---
>>>    drivers/virtio/virtio_ring.c | 699 +++++++++++++++++++++++++++++++++++++------
>>>    include/linux/virtio_ring.h  |   8 +-
>>>    2 files changed, 618 insertions(+), 89 deletions(-)
>>>
>>> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
>>> index eb30f3e09a47..393778a2f809 100644
>>> --- a/drivers/virtio/virtio_ring.c
>>> +++ b/drivers/virtio/virtio_ring.c
>>> @@ -58,14 +58,14 @@
>>>    struct vring_desc_state {
>>>    	void *data;			/* Data for callback. */
>>> -	struct vring_desc *indir_desc;	/* Indirect descriptor, if any. */
>>> +	void *indir_desc;		/* Indirect descriptor, if any. */
>>> +	int num;			/* Descriptor list length. */
>>>    };
>>>    struct vring_virtqueue {
>>>    	struct virtqueue vq;
>>> -	/* Actual memory layout for this queue */
>>> -	struct vring vring;
>>> +	bool packed;
>>>    	/* Can we use weak barriers? */
>>>    	bool weak_barriers;
>>> @@ -87,11 +87,28 @@ struct vring_virtqueue {
>>>    	/* Last used index we've seen. */
>>>    	u16 last_used_idx;
>>> -	/* Last written value to avail->flags */
>>> -	u16 avail_flags_shadow;
>>> -
>>> -	/* Last written value to avail->idx in guest byte order */
>>> -	u16 avail_idx_shadow;
>>> +	union {
>>> +		/* Available for split ring */
>>> +		struct {
>>> +			/* Actual memory layout for this queue */
>>> +			struct vring vring;
>>> +
>>> +			/* Last written value to avail->flags */
>>> +			u16 avail_flags_shadow;
>>> +
>>> +			/* Last written value to avail->idx in
>>> +			 * guest byte order */
>>> +			u16 avail_idx_shadow;
>>> +		};
>>> +
>>> +		/* Available for packed ring */
>>> +		struct {
>>> +			/* Actual memory layout for this queue */
>>> +			struct vring_packed vring_packed;
>>> +			u8 wrap_counter : 1;
>>> +			bool chaining;
>>> +		};
>>> +	};
>>>    	/* How to notify other side. FIXME: commonalize hcalls! */
>>>    	bool (*notify)(struct virtqueue *vq);
>>> @@ -201,26 +218,37 @@ static dma_addr_t vring_map_single(const struct vring_virtqueue *vq,
>>>    			      cpu_addr, size, direction);
>>>    }
>>> -static void vring_unmap_one(const struct vring_virtqueue *vq,
>>> -			    struct vring_desc *desc)
>>> +static void vring_unmap_one(const struct vring_virtqueue *vq, void *_desc)
>>>    {
>> Let's split the helpers to packed/split version like other helpers?
>> (Consider the caller has already known the type of vq).
> Okay.
>

[...]

>>> +				desc[i].flags = flags;
>>> +
>>> +			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
>>> +			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
>>> +			desc[i].id = cpu_to_virtio32(_vq->vdev, head);
>> If it's a part of chain, we only need to do this for last buffer I think.
> I'm not sure I've got your point about the "last buffer".
> But, yes, id just needs to be set for the last desc.

Right, I think I meant "last descriptor" :)

>
>>> +			prev = i;
>>> +			i++;
>> It looks to me prev is always i - 1?
> No. prev will be (vq->vring_packed.num - 1) when i becomes 0.

Right, so prev = i ? i - 1 : vq->vring_packed.num - 1.

>
>>> +			if (!indirect && i >= vq->vring_packed.num) {
>>> +				i = 0;
>>> +				vq->wrap_counter ^= 1;
>>> +			}
>>> +		}
>>> +	}
>>> +	for (; n < (out_sgs + in_sgs); n++) {
>>> +		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
>>> +			dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_FROM_DEVICE);
>>> +			if (vring_mapping_error(vq, addr))
>>> +				goto unmap_release;
>>> +
>>> +			flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT |
>>> +					VRING_DESC_F_WRITE |
>>> +					VRING_DESC_F_AVAIL(vq->wrap_counter) |
>>> +					VRING_DESC_F_USED(!vq->wrap_counter));
>>> +			if (!indirect && i == head)
>>> +				head_flags = flags;
>>> +			else
>>> +				desc[i].flags = flags;
>>> +
>>> +			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
>>> +			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
>>> +			desc[i].id = cpu_to_virtio32(_vq->vdev, head);
>>> +			prev = i;
>>> +			i++;
>>> +			if (!indirect && i >= vq->vring_packed.num) {
>>> +				i = 0;
>>> +				vq->wrap_counter ^= 1;
>>> +			}
>>> +		}
>>> +	}
>>> +	/* Last one doesn't continue. */
>>> +	if (!indirect && (head + 1) % vq->vring_packed.num == i)
>>> +		head_flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
>> I can't get the why we need this here.
> If only one desc is used, we will need to clear the
> VRING_DESC_F_NEXT flag from the head_flags.

Yes, I meant why following desc[prev].flags won't work for this?

>
>>> +	else
>>> +		desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
>>> +
>>> +	if (indirect) {
>>> +		/* FIXME: to be implemented */
>>> +
>>> +		/* Now that the indirect table is filled in, map it. */
>>> +		dma_addr_t addr = vring_map_single(
>>> +			vq, desc, total_sg * sizeof(struct vring_packed_desc),
>>> +			DMA_TO_DEVICE);
>>> +		if (vring_mapping_error(vq, addr))
>>> +			goto unmap_release;
>>> +
>>> +		head_flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT |
>>> +					     VRING_DESC_F_AVAIL(wrap_counter) |
>>> +					     VRING_DESC_F_USED(!wrap_counter));
>>> +		vq->vring_packed.desc[head].addr = cpu_to_virtio64(_vq->vdev, addr);
>>> +		vq->vring_packed.desc[head].len = cpu_to_virtio32(_vq->vdev,
>>> +				total_sg * sizeof(struct vring_packed_desc));
>>> +		vq->vring_packed.desc[head].id = cpu_to_virtio32(_vq->vdev, head);
>>> +	}
>>> +
>>> +	/* We're using some buffers from the free list. */
>>> +	vq->vq.num_free -= descs_used;
>>> +
>>> +	/* Update free pointer */
>>> +	if (indirect) {
>>> +		n = head + 1;
>>> +		if (n >= vq->vring_packed.num) {
>>> +			n = 0;
>>> +			vq->wrap_counter ^= 1;
>>> +		}
>>> +		vq->free_head = n;
>> detach_buf_packed() does not even touch free_head here, so need to explain
>> its meaning for packed ring.
> Above code is for indirect support which isn't really
> implemented in this patch yet.
>
> For your question, free_head stores the index of the
> next avail desc. I'll add a comment for it or move it
> to union and give it a better name in next version.

Yes, something like avail_idx might be better.

>
>>> +	} else
>>> +		vq->free_head = i;
>> ID is only valid in the last descriptor in the list, so head + 1 should be
>> ok too?
> I don't really get your point. The vq->free_head stores
> the index of the next avail desc.

I think I get your idea now, free_head has two meanings:

- next avail index
- buffer id

If I'm correct, let's better add a comment for this.

>
>>> +
>>> +	/* Store token and indirect buffer state. */
>>> +	vq->desc_state[head].num = descs_used;
>>> +	vq->desc_state[head].data = data;
>>> +	if (indirect)
>>> +		vq->desc_state[head].indir_desc = desc;
>>> +	else
>>> +		vq->desc_state[head].indir_desc = ctx;
>>> +
>>> +	virtio_wmb(vq->weak_barriers);
>> Let's add a comment to explain the barrier here.
> Okay.
>
>>> +	vq->vring_packed.desc[head].flags = head_flags;
>>> +	vq->num_added++;
>>> +
>>> +	pr_debug("Added buffer head %i to %p\n", head, vq);
>>> +	END_USE(vq);
>>> +
>>> +	return 0;
>>> +
>>> +unmap_release:
>>> +	err_idx = i;
>>> +	i = head;
>>> +
>>> +	for (n = 0; n < total_sg; n++) {
>>> +		if (i == err_idx)
>>> +			break;
>>> +		vring_unmap_one(vq, &desc[i]);
>>> +		i++;
>>> +		if (!indirect && i >= vq->vring_packed.num)
>>> +			i = 0;
>>> +	}
>>> +
>>> +	vq->wrap_counter = wrap_counter;
>>> +
>>> +	if (indirect)
>>> +		kfree(desc);
>>> +
>>> +	END_USE(vq);
>>> +	return -EIO;
>>> +}
>>> +
>>> +static inline int virtqueue_add(struct virtqueue *_vq,
>>> +				struct scatterlist *sgs[],
>>> +				unsigned int total_sg,
>>> +				unsigned int out_sgs,
>>> +				unsigned int in_sgs,
>>> +				void *data,
>>> +				void *ctx,
>>> +				gfp_t gfp)
>>> +{
>>> +	struct vring_virtqueue *vq = to_vvq(_vq);
>>> +
>>> +	return vq->packed ? virtqueue_add_packed(_vq, sgs, total_sg, out_sgs,
>>> +						 in_sgs, data, ctx, gfp) :
>>> +			    virtqueue_add_split(_vq, sgs, total_sg, out_sgs,
>>> +						in_sgs, data, ctx, gfp);
>>> +}
>>> +
>>>    /**
>>>     * virtqueue_add_sgs - expose buffers to other end
>>>     * @vq: the struct virtqueue we're talking about.
>>> @@ -561,6 +845,12 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq)
>>>    	 * event. */
>>>    	virtio_mb(vq->weak_barriers);
>>> +	if (vq->packed) {
>>> +		/* FIXME: to be implemented */
>>> +		needs_kick = true;
>>> +		goto out;
>>> +	}
>>> +
>>>    	old = vq->avail_idx_shadow - vq->num_added;
>>>    	new = vq->avail_idx_shadow;
>>>    	vq->num_added = 0;
>>> @@ -579,6 +869,8 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq)
>>>    	} else {
>>>    		needs_kick = !(vq->vring.used->flags & cpu_to_virtio16(_vq->vdev, VRING_USED_F_NO_NOTIFY));
>>>    	}
>>> +
>>> +out:
>>>    	END_USE(vq);
>>>    	return needs_kick;
>>>    }
>>> @@ -628,8 +920,8 @@ bool virtqueue_kick(struct virtqueue *vq)
>>>    }
>>>    EXPORT_SYMBOL_GPL(virtqueue_kick);
>>> -static void detach_buf(struct vring_virtqueue *vq, unsigned int head,
>>> -		       void **ctx)
>>> +static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
>>> +			     void **ctx)
>>>    {
>>>    	unsigned int i, j;
>>>    	__virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
>>> @@ -677,29 +969,81 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head,
>>>    	}
>>>    }
>>> -static inline bool more_used(const struct vring_virtqueue *vq)
>>> +static void detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
>>> +			      void **ctx)
>>> +{
>>> +	struct vring_packed_desc *desc;
>>> +	unsigned int i, j;
>>> +
>>> +	/* Clear data ptr. */
>>> +	vq->desc_state[head].data = NULL;
>>> +
>>> +	i = head;
>>> +
>>> +	for (j = 0; j < vq->desc_state[head].num; j++) {
>>> +		desc = &vq->vring_packed.desc[i];
>>> +		vring_unmap_one(vq, desc);
>>> +		i++;
>>> +		if (i >= vq->vring_packed.num)
>>> +			i = 0;
>>> +	}
>>> +
>>> +	vq->vq.num_free += vq->desc_state[head].num;
>> It looks to me vq->free_head grows always, how can we make sure it does not
>> exceeds vq.num here?
> The vq->free_head stores the index of the next avail
> desc. You can find it wraps together with vq->wrap_counter
> in virtqueue_add_packed().
>

I see, thanks.


[...]

>>> +
>>> +	/* detach_buf_packed clears data, so grab it now. */
>>> +	ret = vq->desc_state[i].data;
>>> +	detach_buf_packed(vq, i, ctx);
>>> +
>>> +	vq->last_used_idx += vq->desc_state[i].num;
>>> +	if (vq->last_used_idx >= vq->vring_packed.num)
>>> +		vq->last_used_idx %= vq->vring_packed.num;
>> '-=' should be sufficient here?
> Good suggestion. I think so.
>
>>> +
>>> +	// FIXME: implement the desc event support
>>> +
>>> +#ifdef DEBUG
>>> +	vq->last_add_time_valid = false;
>>> +#endif
>>> +
>>> +	END_USE(vq);
>>> +	return ret;
>>> +}
>>> +

[...]

>>> +
>>> +#if 0
>>> +		vq->chaining = virtio_has_feature(vdev,
>>> +						  VIRTIO_RING_F_LIST_DESC);
>>> +#else
>>> +		vq->chaining = true;
>> Looks like in V10 there's no F_LIST_DESC.
> Yes. I kept this in this patch just because the
> desc chaining is optional in the old spec draft
> when sending out this patch set. I'll remove it
> in next version.
>
>>> +#endif
>>> +	} else {
>>> +		vq->vring = vring.vring_split;
>>> +		vq->avail_flags_shadow = 0;
>>> +		vq->avail_idx_shadow = 0;
>>> +
>>> +		/* Put everything in free lists. */
>>> +		vq->free_head = 0;
>>> +		for (i = 0; i < num-1; i++)
>>> +			vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
>>> +	}
>>> +
>>>    	/* No callback?  Tell other side not to bother us. */
>>>    	if (!callback) {
>>> -		vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
>>> -		if (!vq->event)
>>> -			vq->vring.avail->flags = cpu_to_virtio16(vdev, vq->avail_flags_shadow);
>>> +		if (packed) {
>>> +			// FIXME: to be implemented
>>> +		} else {
>>> +			vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
>>> +			if (!vq->event)
>>> +				vq->vring.avail->flags = cpu_to_virtio16(vdev,
>>> +						vq->avail_flags_shadow);
>>> +		}
>>>    	}
>>> -	/* Put everything in free lists. */
>>> -	vq->free_head = 0;
>>> -	for (i = 0; i < vring.num-1; i++)
>>> -		vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
>>> -	memset(vq->desc_state, 0, vring.num * sizeof(struct vring_desc_state));
>>> +	memset(vq->desc_state, 0, num * sizeof(struct vring_desc_state));
>>>    	return &vq->vq;
>>>    }
>>> @@ -1058,6 +1548,14 @@ static void vring_free_queue(struct virtio_device *vdev, size_t size,
>>>    	}
>>>    }
>>> +static inline int
>>> +__vring_size(unsigned int num, unsigned long align, bool packed)
>>> +{
>>> +	if (packed)
>>> +		return vring_packed_size(num, align);
>>> +	return vring_size(num, align);
>>> +}
>>> +
>>>    struct virtqueue *vring_create_virtqueue(
>>>    	unsigned int index,
>>>    	unsigned int num,
>>> @@ -1074,7 +1572,8 @@ struct virtqueue *vring_create_virtqueue(
>>>    	void *queue = NULL;
>>>    	dma_addr_t dma_addr;
>>>    	size_t queue_size_in_bytes;
>>> -	struct vring vring;
>>> +	union vring_union vring;
>>> +	bool packed;
>>>    	/* We assume num is a power of 2. */
>>>    	if (num & (num - 1)) {
>>> @@ -1082,9 +1581,13 @@ struct virtqueue *vring_create_virtqueue(
>>>    		return NULL;
>>>    	}
>>> +	packed = virtio_has_feature(vdev, VIRTIO_F_RING_PACKED);
>>> +
>>>    	/* TODO: allocate each queue chunk individually */
>>> -	for (; num && vring_size(num, vring_align) > PAGE_SIZE; num /= 2) {
>>> -		queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
>>> +	for (; num && __vring_size(num, vring_align, packed) > PAGE_SIZE;
>>> +			num /= 2) {
>>> +		queue = vring_alloc_queue(vdev, __vring_size(num, vring_align,
>>> +							     packed),
>>>    					  &dma_addr,
>>>    					  GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO);
>>>    		if (queue)
>>> @@ -1096,17 +1599,21 @@ struct virtqueue *vring_create_virtqueue(
>>>    	if (!queue) {
>>>    		/* Try to get a single page. You are my only hope! */
>>> -		queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
>>> +		queue = vring_alloc_queue(vdev, __vring_size(num, vring_align,
>>> +							     packed),
>>>    					  &dma_addr, GFP_KERNEL|__GFP_ZERO);
>>>    	}
>>>    	if (!queue)
>>>    		return NULL;
>>> -	queue_size_in_bytes = vring_size(num, vring_align);
>>> -	vring_init(&vring, num, queue, vring_align);
>>> +	queue_size_in_bytes = __vring_size(num, vring_align, packed);
>>> +	if (packed)
>>> +		vring_packed_init(&vring.vring_packed, num, queue, vring_align);
>>> +	else
>>> +		vring_init(&vring.vring_split, num, queue, vring_align);
>> Let's rename vring_init to vring_init_split() like other helpers?
> The vring_init() is a public API in include/uapi/linux/virtio_ring.h.
> I don't think we can rename it.

I see, then this need more thoughts to unify the API.

>
>>> -	vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
>>> -				   notify, callback, name);
>>> +	vq = __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
>>> +				   context, notify, callback, name);
>>>    	if (!vq) {
>>>    		vring_free_queue(vdev, queue_size_in_bytes, queue,
>>>    				 dma_addr);
>>> @@ -1132,10 +1639,17 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
>>>    				      void (*callback)(struct virtqueue *vq),
>>>    				      const char *name)
>>>    {
>>> -	struct vring vring;
>>> -	vring_init(&vring, num, pages, vring_align);
>>> -	return __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
>>> -				     notify, callback, name);
>>> +	union vring_union vring;
>>> +	bool packed;
>>> +
>>> +	packed = virtio_has_feature(vdev, VIRTIO_F_RING_PACKED);
>>> +	if (packed)
>>> +		vring_packed_init(&vring.vring_packed, num, pages, vring_align);
>>> +	else
>>> +		vring_init(&vring.vring_split, num, pages, vring_align);
>>> +
>>> +	return __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
>>> +				     context, notify, callback, name);
>>>    }
>>>    EXPORT_SYMBOL_GPL(vring_new_virtqueue);
>>> @@ -1145,7 +1659,9 @@ void vring_del_virtqueue(struct virtqueue *_vq)
>>>    	if (vq->we_own_ring) {
>>>    		vring_free_queue(vq->vq.vdev, vq->queue_size_in_bytes,
>>> -				 vq->vring.desc, vq->queue_dma_addr);
>>> +				 vq->packed ? (void *)vq->vring_packed.desc :
>>> +					      (void *)vq->vring.desc,
>>> +				 vq->queue_dma_addr);
>>>    	}
>>>    	list_del(&_vq->list);
>>>    	kfree(vq);
>>> @@ -1159,14 +1675,18 @@ void vring_transport_features(struct virtio_device *vdev)
>>>    	for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) {
>>>    		switch (i) {
>>> +#if 0 // FIXME: to be implemented
>>>    		case VIRTIO_RING_F_INDIRECT_DESC:
>>>    			break;
>>> +#endif
>>>    		case VIRTIO_RING_F_EVENT_IDX:
>>>    			break;
>>>    		case VIRTIO_F_VERSION_1:
>>>    			break;
>>>    		case VIRTIO_F_IOMMU_PLATFORM:
>>>    			break;
>>> +		case VIRTIO_F_RING_PACKED:
>>> +			break;
>>>    		default:
>>>    			/* We don't understand this bit. */
>>>    			__virtio_clear_bit(vdev, i);
>>> @@ -1187,7 +1707,7 @@ unsigned int virtqueue_get_vring_size(struct virtqueue *_vq)
>>>    	struct vring_virtqueue *vq = to_vvq(_vq);
>>> -	return vq->vring.num;
>>> +	return vq->packed ? vq->vring_packed.num : vq->vring.num;
>>>    }
>>>    EXPORT_SYMBOL_GPL(virtqueue_get_vring_size);
>>> @@ -1224,6 +1744,7 @@ dma_addr_t virtqueue_get_desc_addr(struct virtqueue *_vq)
>>>    }
>>>    EXPORT_SYMBOL_GPL(virtqueue_get_desc_addr);
>>> +/* Only available for split ring */
>> Interesting, I think we need this for correctly configure pci. e.g in
>> setup_vq()?
> Yes. The setup_vq() should be updated. But it requires
> QEMU change, so I just kept it as is in this RFC patch.

Ok.

>
>>>    dma_addr_t virtqueue_get_avail_addr(struct virtqueue *_vq)
>>>    {
>>>    	struct vring_virtqueue *vq = to_vvq(_vq);
>>> @@ -1235,6 +1756,7 @@ dma_addr_t virtqueue_get_avail_addr(struct virtqueue *_vq)
>>>    }
>>>    EXPORT_SYMBOL_GPL(virtqueue_get_avail_addr);
>>> +/* Only available for split ring */
>>>    dma_addr_t virtqueue_get_used_addr(struct virtqueue *_vq)
>> Maybe it's better to rename this to get_device_addr().
> It's a kernel API which has been exported by EXPORT_SYMBOL_GPL(),
> I'm not sure whether it's a good idea to rename it.

If it's not a part of uapi, I think we can.

Thanks

>
> Best regards,
> Tiwei Bie
>
>> Thanks
>>
>>>    {
>>>    	struct vring_virtqueue *vq = to_vvq(_vq);
>>> @@ -1246,6 +1768,7 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *_vq)
>>>    }
>>>    EXPORT_SYMBOL_GPL(virtqueue_get_used_addr);
>>> +/* Only available for split ring */
>>>    const struct vring *virtqueue_get_vring(struct virtqueue *vq)
>>>    {
>>>    	return &to_vvq(vq)->vring;
>>> diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
>>> index bbf32524ab27..a0075894ad16 100644
>>> --- a/include/linux/virtio_ring.h
>>> +++ b/include/linux/virtio_ring.h
>>> @@ -60,6 +60,11 @@ static inline void virtio_store_mb(bool weak_barriers,
>>>    struct virtio_device;
>>>    struct virtqueue;
>>> +union vring_union {
>>> +	struct vring vring_split;
>>> +	struct vring_packed vring_packed;
>>> +};
>>> +
>>>    /*
>>>     * Creates a virtqueue and allocates the descriptor ring.  If
>>>     * may_reduce_num is set, then this may allocate a smaller ring than
>>> @@ -79,7 +84,8 @@ struct virtqueue *vring_create_virtqueue(unsigned int index,
>>>    /* Creates a virtqueue with a custom layout. */
>>>    struct virtqueue *__vring_new_virtqueue(unsigned int index,
>>> -					struct vring vring,
>>> +					union vring_union vring,
>>> +					bool packed,
>>>    					struct virtio_device *vdev,
>>>    					bool weak_barriers,
>>>    					bool ctx,

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH RFC 2/2] virtio_ring: support packed ring
From: Tiwei Bie @ 2018-03-16  7:40 UTC (permalink / raw)
  To: Jason Wang; +Cc: mst, netdev, linux-kernel, virtualization, wexu
In-Reply-To: <0a0ecf42-8f7f-9387-8ca4-cb65d0835b56@redhat.com>

On Fri, Mar 16, 2018 at 02:44:12PM +0800, Jason Wang wrote:
> On 2018年03月16日 14:10, Tiwei Bie wrote:
> > On Fri, Mar 16, 2018 at 12:03:25PM +0800, Jason Wang wrote:
> > > On 2018年02月23日 19:18, Tiwei Bie wrote:
> > > > Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> > > > ---
> > > >    drivers/virtio/virtio_ring.c | 699 +++++++++++++++++++++++++++++++++++++------
> > > >    include/linux/virtio_ring.h  |   8 +-
> > > >    2 files changed, 618 insertions(+), 89 deletions(-)
[...]
> > > >    			      cpu_addr, size, direction);
> > > >    }
> > > > -static void vring_unmap_one(const struct vring_virtqueue *vq,
> > > > -			    struct vring_desc *desc)
> > > > +static void vring_unmap_one(const struct vring_virtqueue *vq, void *_desc)
> > > >    {
> > > Let's split the helpers to packed/split version like other helpers?
> > > (Consider the caller has already known the type of vq).
> > Okay.
> > 
> 
> [...]
> 
> > > > +				desc[i].flags = flags;
> > > > +
> > > > +			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
> > > > +			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
> > > > +			desc[i].id = cpu_to_virtio32(_vq->vdev, head);
> > > If it's a part of chain, we only need to do this for last buffer I think.
> > I'm not sure I've got your point about the "last buffer".
> > But, yes, id just needs to be set for the last desc.
> 
> Right, I think I meant "last descriptor" :)
> 
> > 
> > > > +			prev = i;
> > > > +			i++;
> > > It looks to me prev is always i - 1?
> > No. prev will be (vq->vring_packed.num - 1) when i becomes 0.
> 
> Right, so prev = i ? i - 1 : vq->vring_packed.num - 1.

Yes, i wraps together with vq->wrap_counter in following code:

> > > > +			if (!indirect && i >= vq->vring_packed.num) {
> > > > +				i = 0;
> > > > +				vq->wrap_counter ^= 1;
> > > > +			}


> > > > +		}
> > > > +	}
> > > > +	for (; n < (out_sgs + in_sgs); n++) {
> > > > +		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> > > > +			dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_FROM_DEVICE);
> > > > +			if (vring_mapping_error(vq, addr))
> > > > +				goto unmap_release;
> > > > +
> > > > +			flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT |
> > > > +					VRING_DESC_F_WRITE |
> > > > +					VRING_DESC_F_AVAIL(vq->wrap_counter) |
> > > > +					VRING_DESC_F_USED(!vq->wrap_counter));
> > > > +			if (!indirect && i == head)
> > > > +				head_flags = flags;
> > > > +			else
> > > > +				desc[i].flags = flags;
> > > > +
> > > > +			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
> > > > +			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
> > > > +			desc[i].id = cpu_to_virtio32(_vq->vdev, head);
> > > > +			prev = i;
> > > > +			i++;
> > > > +			if (!indirect && i >= vq->vring_packed.num) {
> > > > +				i = 0;
> > > > +				vq->wrap_counter ^= 1;
> > > > +			}
> > > > +		}
> > > > +	}
> > > > +	/* Last one doesn't continue. */
> > > > +	if (!indirect && (head + 1) % vq->vring_packed.num == i)
> > > > +		head_flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
> > > I can't get the why we need this here.
> > If only one desc is used, we will need to clear the
> > VRING_DESC_F_NEXT flag from the head_flags.
> 
> Yes, I meant why following desc[prev].flags won't work for this?

Because the update of desc[head].flags (in above case,
prev == head) has been delayed. The flags is saved in
head_flags.

> 
> > 
> > > > +	else
> > > > +		desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
> > > > +
> > > > +	if (indirect) {
> > > > +		/* FIXME: to be implemented */
> > > > +
> > > > +		/* Now that the indirect table is filled in, map it. */
> > > > +		dma_addr_t addr = vring_map_single(
> > > > +			vq, desc, total_sg * sizeof(struct vring_packed_desc),
> > > > +			DMA_TO_DEVICE);
> > > > +		if (vring_mapping_error(vq, addr))
> > > > +			goto unmap_release;
> > > > +
> > > > +		head_flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT |
> > > > +					     VRING_DESC_F_AVAIL(wrap_counter) |
> > > > +					     VRING_DESC_F_USED(!wrap_counter));
> > > > +		vq->vring_packed.desc[head].addr = cpu_to_virtio64(_vq->vdev, addr);
> > > > +		vq->vring_packed.desc[head].len = cpu_to_virtio32(_vq->vdev,
> > > > +				total_sg * sizeof(struct vring_packed_desc));
> > > > +		vq->vring_packed.desc[head].id = cpu_to_virtio32(_vq->vdev, head);
> > > > +	}
> > > > +
> > > > +	/* We're using some buffers from the free list. */
> > > > +	vq->vq.num_free -= descs_used;
> > > > +
> > > > +	/* Update free pointer */
> > > > +	if (indirect) {
> > > > +		n = head + 1;
> > > > +		if (n >= vq->vring_packed.num) {
> > > > +			n = 0;
> > > > +			vq->wrap_counter ^= 1;
> > > > +		}
> > > > +		vq->free_head = n;
> > > detach_buf_packed() does not even touch free_head here, so need to explain
> > > its meaning for packed ring.
> > Above code is for indirect support which isn't really
> > implemented in this patch yet.
> > 
> > For your question, free_head stores the index of the
> > next avail desc. I'll add a comment for it or move it
> > to union and give it a better name in next version.
> 
> Yes, something like avail_idx might be better.
> 
> > 
> > > > +	} else
> > > > +		vq->free_head = i;
> > > ID is only valid in the last descriptor in the list, so head + 1 should be
> > > ok too?
> > I don't really get your point. The vq->free_head stores
> > the index of the next avail desc.
> 
> I think I get your idea now, free_head has two meanings:
> 
> - next avail index
> - buffer id

In my design, free_head is just the index of the next
avail desc.

Driver can set anything to buffer ID. And in my design,
I save desc index in buffer ID.

I'll add comments for them.

> 
> If I'm correct, let's better add a comment for this.
> 
> > 
> > > > +
> > > > +	/* Store token and indirect buffer state. */
> > > > +	vq->desc_state[head].num = descs_used;
> > > > +	vq->desc_state[head].data = data;
> > > > +	if (indirect)
> > > > +		vq->desc_state[head].indir_desc = desc;
> > > > +	else
> > > > +		vq->desc_state[head].indir_desc = ctx;
> > > > +
> > > > +	virtio_wmb(vq->weak_barriers);
> > > Let's add a comment to explain the barrier here.
> > Okay.
> > 
> > > > +	vq->vring_packed.desc[head].flags = head_flags;
> > > > +	vq->num_added++;
> > > > +
> > > > +	pr_debug("Added buffer head %i to %p\n", head, vq);
> > > > +	END_USE(vq);
> > > > +
> > > > +	return 0;
> > > > +
> > > > +unmap_release:
> > > > +	err_idx = i;
> > > > +	i = head;
> > > > +
> > > > +	for (n = 0; n < total_sg; n++) {
> > > > +		if (i == err_idx)
> > > > +			break;
> > > > +		vring_unmap_one(vq, &desc[i]);
> > > > +		i++;
> > > > +		if (!indirect && i >= vq->vring_packed.num)
> > > > +			i = 0;
> > > > +	}
> > > > +
> > > > +	vq->wrap_counter = wrap_counter;
> > > > +
> > > > +	if (indirect)
> > > > +		kfree(desc);
> > > > +
> > > > +	END_USE(vq);
> > > > +	return -EIO;
> > > > +}
[...]
> > > > @@ -1096,17 +1599,21 @@ struct virtqueue *vring_create_virtqueue(
> > > >    	if (!queue) {
> > > >    		/* Try to get a single page. You are my only hope! */
> > > > -		queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
> > > > +		queue = vring_alloc_queue(vdev, __vring_size(num, vring_align,
> > > > +							     packed),
> > > >    					  &dma_addr, GFP_KERNEL|__GFP_ZERO);
> > > >    	}
> > > >    	if (!queue)
> > > >    		return NULL;
> > > > -	queue_size_in_bytes = vring_size(num, vring_align);
> > > > -	vring_init(&vring, num, queue, vring_align);
> > > > +	queue_size_in_bytes = __vring_size(num, vring_align, packed);
> > > > +	if (packed)
> > > > +		vring_packed_init(&vring.vring_packed, num, queue, vring_align);
> > > > +	else
> > > > +		vring_init(&vring.vring_split, num, queue, vring_align);
> > > Let's rename vring_init to vring_init_split() like other helpers?
> > The vring_init() is a public API in include/uapi/linux/virtio_ring.h.
> > I don't think we can rename it.
> 
> I see, then this need more thoughts to unify the API.

My thought is to keep the old API as is, and introduce
new types and helpers for packed ring.

More details can be found in this patch:
https://lkml.org/lkml/2018/2/23/243
(PS. The type which has bit fields is just for reference,
 and will be changed in next version.)

Do you have any other suggestions?

Best regards,
Tiwei Bie

> 
> > 
> > > > -	vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
> > > > -				   notify, callback, name);
> > > > +	vq = __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
> > > > +				   context, notify, callback, name);
> > > >    	if (!vq) {
> > > >    		vring_free_queue(vdev, queue_size_in_bytes, queue,
> > > >    				 dma_addr);
[...]
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH RFC 2/2] virtio_ring: support packed ring
From: Jason Wang @ 2018-03-16  8:34 UTC (permalink / raw)
  To: Tiwei Bie; +Cc: mst, netdev, linux-kernel, virtualization, wexu
In-Reply-To: <20180316074028.lun2jy45qqnfeymw@debian>



On 2018年03月16日 15:40, Tiwei Bie wrote:
> On Fri, Mar 16, 2018 at 02:44:12PM +0800, Jason Wang wrote:
>> On 2018年03月16日 14:10, Tiwei Bie wrote:
>>> On Fri, Mar 16, 2018 at 12:03:25PM +0800, Jason Wang wrote:
>>>> On 2018年02月23日 19:18, Tiwei Bie wrote:
>>>>> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
>>>>> ---
>>>>>     drivers/virtio/virtio_ring.c | 699 +++++++++++++++++++++++++++++++++++++------
>>>>>     include/linux/virtio_ring.h  |   8 +-
>>>>>     2 files changed, 618 insertions(+), 89 deletions(-)
> [...]
>>>>>     			      cpu_addr, size, direction);
>>>>>     }
>>>>> -static void vring_unmap_one(const struct vring_virtqueue *vq,
>>>>> -			    struct vring_desc *desc)
>>>>> +static void vring_unmap_one(const struct vring_virtqueue *vq, void *_desc)
>>>>>     {
>>>> Let's split the helpers to packed/split version like other helpers?
>>>> (Consider the caller has already known the type of vq).
>>> Okay.
>>>
>> [...]
>>
>>>>> +				desc[i].flags = flags;
>>>>> +
>>>>> +			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
>>>>> +			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
>>>>> +			desc[i].id = cpu_to_virtio32(_vq->vdev, head);
>>>> If it's a part of chain, we only need to do this for last buffer I think.
>>> I'm not sure I've got your point about the "last buffer".
>>> But, yes, id just needs to be set for the last desc.
>> Right, I think I meant "last descriptor" :)
>>
>>>>> +			prev = i;
>>>>> +			i++;
>>>> It looks to me prev is always i - 1?
>>> No. prev will be (vq->vring_packed.num - 1) when i becomes 0.
>> Right, so prev = i ? i - 1 : vq->vring_packed.num - 1.
> Yes, i wraps together with vq->wrap_counter in following code:
>
>>>>> +			if (!indirect && i >= vq->vring_packed.num) {
>>>>> +				i = 0;
>>>>> +				vq->wrap_counter ^= 1;
>>>>> +			}
>
>>>>> +		}
>>>>> +	}
>>>>> +	for (; n < (out_sgs + in_sgs); n++) {
>>>>> +		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
>>>>> +			dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_FROM_DEVICE);
>>>>> +			if (vring_mapping_error(vq, addr))
>>>>> +				goto unmap_release;
>>>>> +
>>>>> +			flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT |
>>>>> +					VRING_DESC_F_WRITE |
>>>>> +					VRING_DESC_F_AVAIL(vq->wrap_counter) |
>>>>> +					VRING_DESC_F_USED(!vq->wrap_counter));
>>>>> +			if (!indirect && i == head)
>>>>> +				head_flags = flags;
>>>>> +			else
>>>>> +				desc[i].flags = flags;
>>>>> +
>>>>> +			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
>>>>> +			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
>>>>> +			desc[i].id = cpu_to_virtio32(_vq->vdev, head);
>>>>> +			prev = i;
>>>>> +			i++;
>>>>> +			if (!indirect && i >= vq->vring_packed.num) {
>>>>> +				i = 0;
>>>>> +				vq->wrap_counter ^= 1;
>>>>> +			}
>>>>> +		}
>>>>> +	}
>>>>> +	/* Last one doesn't continue. */
>>>>> +	if (!indirect && (head + 1) % vq->vring_packed.num == i)
>>>>> +		head_flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
>>>> I can't get the why we need this here.
>>> If only one desc is used, we will need to clear the
>>> VRING_DESC_F_NEXT flag from the head_flags.
>> Yes, I meant why following desc[prev].flags won't work for this?
> Because the update of desc[head].flags (in above case,
> prev == head) has been delayed. The flags is saved in
> head_flags.

Ok, but let's try to avoid modular here e.g tracking the number of sgs 
in a counter.

And I see lots of duplication in the above two loops, I believe we can 
unify them with a a single loop. the only difference is dma direction 
and write flag.

>
>>>>> +	else
>>>>> +		desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
>>>>> +
>>>>> +	if (indirect) {
>>>>> +		/* FIXME: to be implemented */
>>>>> +
>>>>> +		/* Now that the indirect table is filled in, map it. */
>>>>> +		dma_addr_t addr = vring_map_single(
>>>>> +			vq, desc, total_sg * sizeof(struct vring_packed_desc),
>>>>> +			DMA_TO_DEVICE);
>>>>> +		if (vring_mapping_error(vq, addr))
>>>>> +			goto unmap_release;
>>>>> +
>>>>> +		head_flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT |
>>>>> +					     VRING_DESC_F_AVAIL(wrap_counter) |
>>>>> +					     VRING_DESC_F_USED(!wrap_counter));
>>>>> +		vq->vring_packed.desc[head].addr = cpu_to_virtio64(_vq->vdev, addr);
>>>>> +		vq->vring_packed.desc[head].len = cpu_to_virtio32(_vq->vdev,
>>>>> +				total_sg * sizeof(struct vring_packed_desc));
>>>>> +		vq->vring_packed.desc[head].id = cpu_to_virtio32(_vq->vdev, head);
>>>>> +	}
>>>>> +
>>>>> +	/* We're using some buffers from the free list. */
>>>>> +	vq->vq.num_free -= descs_used;
>>>>> +
>>>>> +	/* Update free pointer */
>>>>> +	if (indirect) {
>>>>> +		n = head + 1;
>>>>> +		if (n >= vq->vring_packed.num) {
>>>>> +			n = 0;
>>>>> +			vq->wrap_counter ^= 1;
>>>>> +		}
>>>>> +		vq->free_head = n;
>>>> detach_buf_packed() does not even touch free_head here, so need to explain
>>>> its meaning for packed ring.
>>> Above code is for indirect support which isn't really
>>> implemented in this patch yet.
>>>
>>> For your question, free_head stores the index of the
>>> next avail desc. I'll add a comment for it or move it
>>> to union and give it a better name in next version.
>> Yes, something like avail_idx might be better.
>>
>>>>> +	} else
>>>>> +		vq->free_head = i;
>>>> ID is only valid in the last descriptor in the list, so head + 1 should be
>>>> ok too?
>>> I don't really get your point. The vq->free_head stores
>>> the index of the next avail desc.
>> I think I get your idea now, free_head has two meanings:
>>
>> - next avail index
>> - buffer id
> In my design, free_head is just the index of the next
> avail desc.
>
> Driver can set anything to buffer ID.

Then you need another method to track id to context e.g hashing.

>   And in my design,
> I save desc index in buffer ID.
>
> I'll add comments for them.
>
>> If I'm correct, let's better add a comment for this.
>>
>>>>> +
>>>>> +	/* Store token and indirect buffer state. */
>>>>> +	vq->desc_state[head].num = descs_used;
>>>>> +	vq->desc_state[head].data = data;
>>>>> +	if (indirect)
>>>>> +		vq->desc_state[head].indir_desc = desc;
>>>>> +	else
>>>>> +		vq->desc_state[head].indir_desc = ctx;
>>>>> +
>>>>> +	virtio_wmb(vq->weak_barriers);
>>>> Let's add a comment to explain the barrier here.
>>> Okay.
>>>
>>>>> +	vq->vring_packed.desc[head].flags = head_flags;
>>>>> +	vq->num_added++;
>>>>> +
>>>>> +	pr_debug("Added buffer head %i to %p\n", head, vq);
>>>>> +	END_USE(vq);
>>>>> +
>>>>> +	return 0;
>>>>> +
>>>>> +unmap_release:
>>>>> +	err_idx = i;
>>>>> +	i = head;
>>>>> +
>>>>> +	for (n = 0; n < total_sg; n++) {
>>>>> +		if (i == err_idx)
>>>>> +			break;
>>>>> +		vring_unmap_one(vq, &desc[i]);
>>>>> +		i++;
>>>>> +		if (!indirect && i >= vq->vring_packed.num)
>>>>> +			i = 0;
>>>>> +	}
>>>>> +
>>>>> +	vq->wrap_counter = wrap_counter;
>>>>> +
>>>>> +	if (indirect)
>>>>> +		kfree(desc);
>>>>> +
>>>>> +	END_USE(vq);
>>>>> +	return -EIO;
>>>>> +}
> [...]
>>>>> @@ -1096,17 +1599,21 @@ struct virtqueue *vring_create_virtqueue(
>>>>>     	if (!queue) {
>>>>>     		/* Try to get a single page. You are my only hope! */
>>>>> -		queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
>>>>> +		queue = vring_alloc_queue(vdev, __vring_size(num, vring_align,
>>>>> +							     packed),
>>>>>     					  &dma_addr, GFP_KERNEL|__GFP_ZERO);
>>>>>     	}
>>>>>     	if (!queue)
>>>>>     		return NULL;
>>>>> -	queue_size_in_bytes = vring_size(num, vring_align);
>>>>> -	vring_init(&vring, num, queue, vring_align);
>>>>> +	queue_size_in_bytes = __vring_size(num, vring_align, packed);
>>>>> +	if (packed)
>>>>> +		vring_packed_init(&vring.vring_packed, num, queue, vring_align);
>>>>> +	else
>>>>> +		vring_init(&vring.vring_split, num, queue, vring_align);
>>>> Let's rename vring_init to vring_init_split() like other helpers?
>>> The vring_init() is a public API in include/uapi/linux/virtio_ring.h.
>>> I don't think we can rename it.
>> I see, then this need more thoughts to unify the API.
> My thought is to keep the old API as is, and introduce
> new types and helpers for packed ring.

I admit it's not a fault of this patch. But we'd better think of this in 
the future, consider we may have new kinds of ring.

>
> More details can be found in this patch:
> https://lkml.org/lkml/2018/2/23/243
> (PS. The type which has bit fields is just for reference,
>   and will be changed in next version.)
>
> Do you have any other suggestions?

No.

Thanks

>
> Best regards,
> Tiwei Bie
>
>>>>> -	vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
>>>>> -				   notify, callback, name);
>>>>> +	vq = __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
>>>>> +				   context, notify, callback, name);
>>>>>     	if (!vq) {
>>>>>     		vring_free_queue(vdev, queue_size_in_bytes, queue,
>>>>>     				 dma_addr);
> [...]

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH RFC 2/2] virtio_ring: support packed ring
From: Tiwei Bie @ 2018-03-16 10:04 UTC (permalink / raw)
  To: Jason Wang; +Cc: mst, netdev, linux-kernel, virtualization, wexu
In-Reply-To: <02a3da02-8226-ba4e-1b47-d25755b2c429@redhat.com>

On Fri, Mar 16, 2018 at 04:34:28PM +0800, Jason Wang wrote:
> On 2018年03月16日 15:40, Tiwei Bie wrote:
> > On Fri, Mar 16, 2018 at 02:44:12PM +0800, Jason Wang wrote:
> > > On 2018年03月16日 14:10, Tiwei Bie wrote:
> > > > On Fri, Mar 16, 2018 at 12:03:25PM +0800, Jason Wang wrote:
> > > > > On 2018年02月23日 19:18, Tiwei Bie wrote:
> > > > > > Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> > > > > > ---
> > > > > >     drivers/virtio/virtio_ring.c | 699 +++++++++++++++++++++++++++++++++++++------
> > > > > >     include/linux/virtio_ring.h  |   8 +-
> > > > > >     2 files changed, 618 insertions(+), 89 deletions(-)
> > [...]
> > > > > >     			      cpu_addr, size, direction);
> > > > > >     }
> > > > > > -static void vring_unmap_one(const struct vring_virtqueue *vq,
> > > > > > -			    struct vring_desc *desc)
> > > > > > +static void vring_unmap_one(const struct vring_virtqueue *vq, void *_desc)
> > > > > >     {
> > > > > Let's split the helpers to packed/split version like other helpers?
> > > > > (Consider the caller has already known the type of vq).
> > > > Okay.
> > > > 
> > > [...]
> > > 
> > > > > > +				desc[i].flags = flags;
> > > > > > +
> > > > > > +			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
> > > > > > +			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
> > > > > > +			desc[i].id = cpu_to_virtio32(_vq->vdev, head);
> > > > > If it's a part of chain, we only need to do this for last buffer I think.
> > > > I'm not sure I've got your point about the "last buffer".
> > > > But, yes, id just needs to be set for the last desc.
> > > Right, I think I meant "last descriptor" :)
> > > 
> > > > > > +			prev = i;
> > > > > > +			i++;
> > > > > It looks to me prev is always i - 1?
> > > > No. prev will be (vq->vring_packed.num - 1) when i becomes 0.
> > > Right, so prev = i ? i - 1 : vq->vring_packed.num - 1.
> > Yes, i wraps together with vq->wrap_counter in following code:
> > 
> > > > > > +			if (!indirect && i >= vq->vring_packed.num) {
> > > > > > +				i = 0;
> > > > > > +				vq->wrap_counter ^= 1;
> > > > > > +			}
> > 
> > > > > > +		}
> > > > > > +	}
> > > > > > +	for (; n < (out_sgs + in_sgs); n++) {
> > > > > > +		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> > > > > > +			dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_FROM_DEVICE);
> > > > > > +			if (vring_mapping_error(vq, addr))
> > > > > > +				goto unmap_release;
> > > > > > +
> > > > > > +			flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT |
> > > > > > +					VRING_DESC_F_WRITE |
> > > > > > +					VRING_DESC_F_AVAIL(vq->wrap_counter) |
> > > > > > +					VRING_DESC_F_USED(!vq->wrap_counter));
> > > > > > +			if (!indirect && i == head)
> > > > > > +				head_flags = flags;
> > > > > > +			else
> > > > > > +				desc[i].flags = flags;
> > > > > > +
> > > > > > +			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
> > > > > > +			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
> > > > > > +			desc[i].id = cpu_to_virtio32(_vq->vdev, head);
> > > > > > +			prev = i;
> > > > > > +			i++;
> > > > > > +			if (!indirect && i >= vq->vring_packed.num) {
> > > > > > +				i = 0;
> > > > > > +				vq->wrap_counter ^= 1;
> > > > > > +			}
> > > > > > +		}
> > > > > > +	}
> > > > > > +	/* Last one doesn't continue. */
> > > > > > +	if (!indirect && (head + 1) % vq->vring_packed.num == i)
> > > > > > +		head_flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
> > > > > I can't get the why we need this here.
> > > > If only one desc is used, we will need to clear the
> > > > VRING_DESC_F_NEXT flag from the head_flags.
> > > Yes, I meant why following desc[prev].flags won't work for this?
> > Because the update of desc[head].flags (in above case,
> > prev == head) has been delayed. The flags is saved in
> > head_flags.
> 
> Ok, but let's try to avoid modular here e.g tracking the number of sgs in a
> counter.
> 
> And I see lots of duplication in the above two loops, I believe we can unify
> them with a a single loop. the only difference is dma direction and write
> flag.

The above implementation for packed ring is basically
an mirror of the existing implementation in split ring
as I want to keep the coding style consistent. Below
is the corresponding code in split ring:

static inline int virtqueue_add(struct virtqueue *_vq,
				struct scatterlist *sgs[],
				unsigned int total_sg,
				unsigned int out_sgs,
				unsigned int in_sgs,
				void *data,
				void *ctx,
				gfp_t gfp)
{
	......

	for (n = 0; n < out_sgs; n++) {
		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
			dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_TO_DEVICE);
			if (vring_mapping_error(vq, addr))
				goto unmap_release;

			desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT);
			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
			prev = i;
			i = virtio16_to_cpu(_vq->vdev, desc[i].next);
		}
	}
	for (; n < (out_sgs + in_sgs); n++) {
		for (sg = sgs[n]; sg; sg = sg_next(sg)) {
			dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_FROM_DEVICE);
			if (vring_mapping_error(vq, addr))
				goto unmap_release;

			desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT | VRING_DESC_F_WRITE);
			desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
			desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
			prev = i;
			i = virtio16_to_cpu(_vq->vdev, desc[i].next);
		}
	}

	......
}

> 
> > 
> > > > > > +	else
> > > > > > +		desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
> > > > > > +
> > > > > > +	if (indirect) {
> > > > > > +		/* FIXME: to be implemented */
> > > > > > +
> > > > > > +		/* Now that the indirect table is filled in, map it. */
> > > > > > +		dma_addr_t addr = vring_map_single(
> > > > > > +			vq, desc, total_sg * sizeof(struct vring_packed_desc),
> > > > > > +			DMA_TO_DEVICE);
> > > > > > +		if (vring_mapping_error(vq, addr))
> > > > > > +			goto unmap_release;
> > > > > > +
> > > > > > +		head_flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT |
> > > > > > +					     VRING_DESC_F_AVAIL(wrap_counter) |
> > > > > > +					     VRING_DESC_F_USED(!wrap_counter));
> > > > > > +		vq->vring_packed.desc[head].addr = cpu_to_virtio64(_vq->vdev, addr);
> > > > > > +		vq->vring_packed.desc[head].len = cpu_to_virtio32(_vq->vdev,
> > > > > > +				total_sg * sizeof(struct vring_packed_desc));
> > > > > > +		vq->vring_packed.desc[head].id = cpu_to_virtio32(_vq->vdev, head);
> > > > > > +	}
> > > > > > +
> > > > > > +	/* We're using some buffers from the free list. */
> > > > > > +	vq->vq.num_free -= descs_used;
> > > > > > +
> > > > > > +	/* Update free pointer */
> > > > > > +	if (indirect) {
> > > > > > +		n = head + 1;
> > > > > > +		if (n >= vq->vring_packed.num) {
> > > > > > +			n = 0;
> > > > > > +			vq->wrap_counter ^= 1;
> > > > > > +		}
> > > > > > +		vq->free_head = n;
> > > > > detach_buf_packed() does not even touch free_head here, so need to explain
> > > > > its meaning for packed ring.
> > > > Above code is for indirect support which isn't really
> > > > implemented in this patch yet.
> > > > 
> > > > For your question, free_head stores the index of the
> > > > next avail desc. I'll add a comment for it or move it
> > > > to union and give it a better name in next version.
> > > Yes, something like avail_idx might be better.
> > > 
> > > > > > +	} else
> > > > > > +		vq->free_head = i;
> > > > > ID is only valid in the last descriptor in the list, so head + 1 should be
> > > > > ok too?
> > > > I don't really get your point. The vq->free_head stores
> > > > the index of the next avail desc.
> > > I think I get your idea now, free_head has two meanings:
> > > 
> > > - next avail index
> > > - buffer id
> > In my design, free_head is just the index of the next
> > avail desc.
> > 
> > Driver can set anything to buffer ID.
> 
> Then you need another method to track id to context e.g hashing.

I keep the context in desc_state[desc_idx]. So there is
no extra method needed to track the context.

> 
> >   And in my design,
> > I save desc index in buffer ID.
> > 
> > I'll add comments for them.
> > 
> > > If I'm correct, let's better add a comment for this.
> > > 
> > > > > > +
> > > > > > +	/* Store token and indirect buffer state. */
> > > > > > +	vq->desc_state[head].num = descs_used;
> > > > > > +	vq->desc_state[head].data = data;
> > > > > > +	if (indirect)
> > > > > > +		vq->desc_state[head].indir_desc = desc;
> > > > > > +	else
> > > > > > +		vq->desc_state[head].indir_desc = ctx;
> > > > > > +
> > > > > > +	virtio_wmb(vq->weak_barriers);
> > > > > Let's add a comment to explain the barrier here.
> > > > Okay.
> > > > 
> > > > > > +	vq->vring_packed.desc[head].flags = head_flags;
> > > > > > +	vq->num_added++;
> > > > > > +
> > > > > > +	pr_debug("Added buffer head %i to %p\n", head, vq);
> > > > > > +	END_USE(vq);
> > > > > > +
> > > > > > +	return 0;
> > > > > > +
> > > > > > +unmap_release:
> > > > > > +	err_idx = i;
> > > > > > +	i = head;
> > > > > > +
> > > > > > +	for (n = 0; n < total_sg; n++) {
> > > > > > +		if (i == err_idx)
> > > > > > +			break;
> > > > > > +		vring_unmap_one(vq, &desc[i]);
> > > > > > +		i++;
> > > > > > +		if (!indirect && i >= vq->vring_packed.num)
> > > > > > +			i = 0;
> > > > > > +	}
> > > > > > +
> > > > > > +	vq->wrap_counter = wrap_counter;
> > > > > > +
> > > > > > +	if (indirect)
> > > > > > +		kfree(desc);
> > > > > > +
> > > > > > +	END_USE(vq);
> > > > > > +	return -EIO;
> > > > > > +}
> > [...]
> > > > > > @@ -1096,17 +1599,21 @@ struct virtqueue *vring_create_virtqueue(
> > > > > >     	if (!queue) {
> > > > > >     		/* Try to get a single page. You are my only hope! */
> > > > > > -		queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
> > > > > > +		queue = vring_alloc_queue(vdev, __vring_size(num, vring_align,
> > > > > > +							     packed),
> > > > > >     					  &dma_addr, GFP_KERNEL|__GFP_ZERO);
> > > > > >     	}
> > > > > >     	if (!queue)
> > > > > >     		return NULL;
> > > > > > -	queue_size_in_bytes = vring_size(num, vring_align);
> > > > > > -	vring_init(&vring, num, queue, vring_align);
> > > > > > +	queue_size_in_bytes = __vring_size(num, vring_align, packed);
> > > > > > +	if (packed)
> > > > > > +		vring_packed_init(&vring.vring_packed, num, queue, vring_align);
> > > > > > +	else
> > > > > > +		vring_init(&vring.vring_split, num, queue, vring_align);
> > > > > Let's rename vring_init to vring_init_split() like other helpers?
> > > > The vring_init() is a public API in include/uapi/linux/virtio_ring.h.
> > > > I don't think we can rename it.
> > > I see, then this need more thoughts to unify the API.
> > My thought is to keep the old API as is, and introduce
> > new types and helpers for packed ring.
> 
> I admit it's not a fault of this patch. But we'd better think of this in the
> future, consider we may have new kinds of ring.
> 
> > 
> > More details can be found in this patch:
> > https://lkml.org/lkml/2018/2/23/243
> > (PS. The type which has bit fields is just for reference,
> >   and will be changed in next version.)
> > 
> > Do you have any other suggestions?
> 
> No.

Hmm.. Sorry, I didn't describe my question well.
I mean do you have any suggestions about the API
design for packed ring in uapi header? Currently
I introduced below two new helpers:

static inline void vring_packed_init(struct vring_packed *vr, unsigned int num,
				     void *p, unsigned long align);
static inline unsigned vring_packed_size(unsigned int num, unsigned long align);

When new rings are introduced in the future, above
helpers can't be reused. Maybe we should make the
helpers be able to determine the ring type?

Best regards,
Tiwei Bie

> 
> Thanks
> 
> > 
> > Best regards,
> > Tiwei Bie
> > 
> > > > > > -	vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
> > > > > > -				   notify, callback, name);
> > > > > > +	vq = __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
> > > > > > +				   context, notify, callback, name);
> > > > > >     	if (!vq) {
> > > > > >     		vring_free_queue(vdev, queue_size_in_bytes, queue,
> > > > > >     				 dma_addr);
> > [...]
> 
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox