Linux Documentation
 help / color / mirror / Atom feed
* [PATCH v5 4/5] KVM: selftests: Add option for different backing in pre-fault tests
From: Jack Thomson @ 2026-06-12 16:23 UTC (permalink / raw)
  To: maz, oupton, pbonzini
  Cc: joey.gouly, seiden, suzuki.poulose, yuzenghui, catalin.marinas,
	will, shuah, corbet, vladimir.murzin, linux-arm-kernel, kvmarm,
	kvm, linux-kernel, linux-kselftest, linux-doc, isaku.yamahata,
	Jack Thomson
In-Reply-To: <20260612162354.73378-1-jackabt.amazon@gmail.com>

From: Jack Thomson <jackabt@amazon.com>

Add a -s option to specify different memory backing types for the
pre-fault tests (e.g. anonymous, hugetlb), allowing testing of the
pre-fault functionality across different memory configurations.

Signed-off-by: Jack Thomson <jackabt@amazon.com>
---
 .../selftests/kvm/pre_fault_memory_test.c     | 51 +++++++++++++------
 1 file changed, 36 insertions(+), 15 deletions(-)

diff --git a/tools/testing/selftests/kvm/pre_fault_memory_test.c b/tools/testing/selftests/kvm/pre_fault_memory_test.c
index 9f5f0d1a5db1..c850cf28e86a 100644
--- a/tools/testing/selftests/kvm/pre_fault_memory_test.c
+++ b/tools/testing/selftests/kvm/pre_fault_memory_test.c
@@ -45,6 +45,7 @@ struct slot_worker_data {
 	struct kvm_vm *vm;
 	gpa_t gpa;
 	u32 flags;
+	enum vm_mem_backing_src_type mem_backing_src;
 	bool worker_ready;
 	bool prefault_ready;
 	bool recreate_slot;
@@ -65,14 +66,16 @@ static void *delete_slot_worker(void *__data)
 	while (!READ_ONCE(data->recreate_slot))
 		cpu_relax();
 
-	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, data->gpa,
+	vm_userspace_mem_region_add(vm, data->mem_backing_src, data->gpa,
 				    TEST_SLOT, test_config.test_num_pages, data->flags);
 
 	return NULL;
 }
 
 static void pre_fault_memory(struct kvm_vcpu *vcpu, u64 base_gpa, u64 offset,
-			     u64 size, u64 expected_left, bool private)
+			     u64 size, u64 expected_left,
+			     enum vm_mem_backing_src_type mem_backing_src,
+			     bool private)
 {
 	struct kvm_pre_fault_memory range = {
 		.gpa = base_gpa + offset,
@@ -83,6 +86,7 @@ static void pre_fault_memory(struct kvm_vcpu *vcpu, u64 base_gpa, u64 offset,
 		.vm = vcpu->vm,
 		.gpa = base_gpa,
 		.flags = private ? KVM_MEM_GUEST_MEMFD : 0,
+		.mem_backing_src = mem_backing_src,
 	};
 	bool slot_recreated = false;
 	pthread_t slot_worker;
@@ -172,11 +176,13 @@ static void pre_fault_memory(struct kvm_vcpu *vcpu, u64 base_gpa, u64 offset,
 struct test_params {
 	unsigned long vm_type;
 	bool private;
+	enum vm_mem_backing_src_type mem_backing_src;
 };
 
 static void __test_pre_fault_memory(enum vm_guest_mode guest_mode, void *arg)
 {
 	gpa_t gpa, gva, alignment, guest_page_size, host_page_size;
+	gpa_t backing_src_pagesz, mem_page_size;
 	struct test_params *p = arg;
 	const struct vm_shape shape = {
 		.mode = guest_mode,
@@ -188,24 +194,28 @@ static void __test_pre_fault_memory(enum vm_guest_mode guest_mode, void *arg)
 	struct ucall uc;
 
 	pr_info("Testing guest mode: %s\n", vm_guest_mode_string(guest_mode));
+	pr_info("Testing memory backing src type: %s\n",
+		vm_mem_backing_src_alias(p->mem_backing_src)->name);
 
 	vm = vm_create_shape_with_one_vcpu(shape, &vcpu, guest_code);
 
 	guest_page_size = vm_guest_mode_params[guest_mode].page_size;
 	host_page_size = getpagesize();
+	backing_src_pagesz = get_backing_src_pagesz(p->mem_backing_src);
+	mem_page_size = max(host_page_size, backing_src_pagesz);
 
 	test_config.page_size = guest_page_size;
 	test_config.test_size = align_up(TEST_BASE_SIZE + test_config.page_size,
-					 host_page_size);
+					 mem_page_size);
 	test_config.test_num_pages = vm_calc_num_guest_pages(vm->mode, test_config.test_size);
 
 	gpa = (vm->max_gfn - test_config.test_num_pages) * test_config.page_size;
 	alignment = SZ_2M;
-	alignment = max(alignment, host_page_size);
+	alignment = max(alignment, mem_page_size);
 	gpa = align_down(gpa, alignment);
 	gva = gpa & ((1ULL << (vm->va_bits - 1)) - 1);
 
-	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+	vm_userspace_mem_region_add(vm, p->mem_backing_src,
 				    gpa, TEST_SLOT, test_config.test_num_pages,
 				    p->private ? KVM_MEM_GUEST_MEMFD : 0);
 	virt_map(vm, gva, gpa, test_config.test_num_pages);
@@ -213,14 +223,18 @@ static void __test_pre_fault_memory(enum vm_guest_mode guest_mode, void *arg)
 	if (p->private)
 		vm_mem_set_private(vm, gpa, test_config.test_size);
 
-	pre_fault_memory(vcpu, gpa, 0, test_config.test_size, 0, p->private);
+	pre_fault_memory(vcpu, gpa, 0, test_config.test_size, 0,
+			 p->mem_backing_src, p->private);
 	/* Retry the same range after the first prefault attempt. */
-	pre_fault_memory(vcpu, gpa, 0, test_config.test_size, 0, p->private);
+	pre_fault_memory(vcpu, gpa, 0, test_config.test_size, 0,
+			 p->mem_backing_src, p->private);
 	pre_fault_memory(vcpu, gpa,
 			 test_config.test_size - host_page_size,
-			 host_page_size * 2, host_page_size, p->private);
+			 host_page_size * 2, host_page_size,
+			 p->mem_backing_src, p->private);
 	pre_fault_memory(vcpu, gpa, test_config.test_size,
-			 host_page_size, host_page_size, p->private);
+			 host_page_size, host_page_size,
+			 p->mem_backing_src, p->private);
 
 	vcpu_args_set(vcpu, 1, gva);
 
@@ -249,11 +263,13 @@ static void __test_pre_fault_memory(enum vm_guest_mode guest_mode, void *arg)
 	kvm_vm_free(vm);
 }
 
-static void test_pre_fault_memory(unsigned long vm_type, bool private)
+static void test_pre_fault_memory(unsigned long vm_type, enum vm_mem_backing_src_type backing_src,
+				  bool private)
 {
 	struct test_params p = {
 		.vm_type = vm_type,
 		.private = private,
+		.mem_backing_src = backing_src,
 	};
 
 	if (vm_type && !(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(vm_type))) {
@@ -267,23 +283,28 @@ static void test_pre_fault_memory(unsigned long vm_type, bool private)
 static void help(char *name)
 {
 	puts("");
-	printf("usage: %s [-h] [-m mode]\n", name);
+	printf("usage: %s [-h] [-m mode] [-s mem-type]\n", name);
 	puts("");
 	guest_modes_help();
+	backing_src_help("-s");
 	puts("");
 }
 
 int main(int argc, char *argv[])
 {
+	enum vm_mem_backing_src_type backing = DEFAULT_VM_MEM_SRC;
 	int opt;
 
 	guest_modes_append_default();
 
-	while ((opt = getopt(argc, argv, "hm:")) != -1) {
+	while ((opt = getopt(argc, argv, "hm:s:")) != -1) {
 		switch (opt) {
 		case 'm':
 			guest_modes_cmdline(optarg);
 			break;
+		case 's':
+			backing = parse_backing_src_type(optarg);
+			break;
 		case 'h':
 		default:
 			help(argv[0]);
@@ -293,10 +314,10 @@ int main(int argc, char *argv[])
 
 	TEST_REQUIRE(kvm_check_cap(KVM_CAP_PRE_FAULT_MEMORY));
 
-	test_pre_fault_memory(0, false);
+	test_pre_fault_memory(0, backing, false);
 #ifdef __x86_64__
-	test_pre_fault_memory(KVM_X86_SW_PROTECTED_VM, false);
-	test_pre_fault_memory(KVM_X86_SW_PROTECTED_VM, true);
+	test_pre_fault_memory(KVM_X86_SW_PROTECTED_VM, backing, false);
+	test_pre_fault_memory(KVM_X86_SW_PROTECTED_VM, backing, true);
 #endif
 	return 0;
 }
-- 
2.43.0


^ permalink raw reply related

* [PATCH v5 3/5] KVM: selftests: Enable pre_fault_memory_test for arm64
From: Jack Thomson @ 2026-06-12 16:23 UTC (permalink / raw)
  To: maz, oupton, pbonzini
  Cc: joey.gouly, seiden, suzuki.poulose, yuzenghui, catalin.marinas,
	will, shuah, corbet, vladimir.murzin, linux-arm-kernel, kvmarm,
	kvm, linux-kernel, linux-kselftest, linux-doc, isaku.yamahata,
	Jack Thomson
In-Reply-To: <20260612162354.73378-1-jackabt.amazon@gmail.com>

From: Jack Thomson <jackabt@amazon.com>

Enable the pre_fault_memory_test to run on arm64 by making it work with
different guest page sizes and testing multiple guest configurations.

Update the TEST_ASSERT() on the vCPU exit reason to compare against
UCALL_EXIT_REASON, for portability, as arm64 exits with KVM_EXIT_MMIO
while x86 uses KVM_EXIT_IO.

Signed-off-by: Jack Thomson <jackabt@amazon.com>
---
 tools/testing/selftests/kvm/Makefile.kvm      |   1 +
 .../selftests/kvm/pre_fault_memory_test.c     | 115 ++++++++++++++----
 2 files changed, 92 insertions(+), 24 deletions(-)

diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index 9118a5a51b89..4609d8f23e38 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -194,6 +194,7 @@ TEST_GEN_PROGS_arm64 += guest_memfd_test
 TEST_GEN_PROGS_arm64 += mmu_stress_test
 TEST_GEN_PROGS_arm64 += rseq_test
 TEST_GEN_PROGS_arm64 += steal_time
+TEST_GEN_PROGS_arm64 += pre_fault_memory_test
 
 TEST_GEN_PROGS_s390 = $(TEST_GEN_PROGS_COMMON)
 TEST_GEN_PROGS_s390 += s390/memop
diff --git a/tools/testing/selftests/kvm/pre_fault_memory_test.c b/tools/testing/selftests/kvm/pre_fault_memory_test.c
index fcb57fd034e6..9f5f0d1a5db1 100644
--- a/tools/testing/selftests/kvm/pre_fault_memory_test.c
+++ b/tools/testing/selftests/kvm/pre_fault_memory_test.c
@@ -11,19 +11,29 @@
 #include <kvm_util.h>
 #include <processor.h>
 #include <pthread.h>
+#include <guest_modes.h>
 
 /* Arbitrarily chosen values */
-#define TEST_SIZE		(SZ_2M + PAGE_SIZE)
-#define TEST_NPAGES		(TEST_SIZE / PAGE_SIZE)
+#define TEST_BASE_SIZE		SZ_2M
 #define TEST_SLOT		10
 
+/* Storage of test info to share with guest code */
+struct test_config {
+	u64 page_size;
+	u64 test_size;
+	u64 test_num_pages;
+};
+
+static struct test_config test_config;
+
 static void guest_code(u64 base_gva)
 {
 	volatile u64 val __used;
+	struct test_config *config = &test_config;
 	int i;
 
-	for (i = 0; i < TEST_NPAGES; i++) {
-		u64 *src = (u64 *)(base_gva + i * PAGE_SIZE);
+	for (i = 0; i < config->test_num_pages; i++) {
+		u64 *src = (u64 *)(base_gva + i * config->page_size);
 
 		val = *src;
 	}
@@ -56,7 +66,7 @@ static void *delete_slot_worker(void *__data)
 		cpu_relax();
 
 	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, data->gpa,
-				    TEST_SLOT, TEST_NPAGES, data->flags);
+				    TEST_SLOT, test_config.test_num_pages, data->flags);
 
 	return NULL;
 }
@@ -149,8 +159,8 @@ static void pre_fault_memory(struct kvm_vcpu *vcpu, u64 base_gpa, u64 offset,
 	/*
 	 * Assert success if prefaulting the entire range should succeed, i.e.
 	 * complete with no bytes remaining.  Otherwise prefaulting should have
-	 * failed due to ENOENT (due to RET_PF_EMULATE for emulated MMIO when
-	 * no memslot exists).
+	 * failed due to ENOENT (no memslot exists for the GPA; on x86 this
+	 * surfaces via RET_PF_EMULATE).
 	 */
 	if (!expected_left)
 		TEST_ASSERT_VM_VCPU_IOCTL(!ret, KVM_PRE_FAULT_MEMORY, ret, vcpu->vm);
@@ -159,43 +169,70 @@ static void pre_fault_memory(struct kvm_vcpu *vcpu, u64 base_gpa, u64 offset,
 					  KVM_PRE_FAULT_MEMORY, ret, vcpu->vm);
 }
 
-static void __test_pre_fault_memory(unsigned long vm_type, bool private)
+struct test_params {
+	unsigned long vm_type;
+	bool private;
+};
+
+static void __test_pre_fault_memory(enum vm_guest_mode guest_mode, void *arg)
 {
-	gpa_t gpa, gva, alignment, guest_page_size;
+	gpa_t gpa, gva, alignment, guest_page_size, host_page_size;
+	struct test_params *p = arg;
 	const struct vm_shape shape = {
-		.mode = VM_MODE_DEFAULT,
-		.type = vm_type,
+		.mode = guest_mode,
+		.type = p->vm_type,
 	};
 	struct kvm_vcpu *vcpu;
 	struct kvm_run *run;
 	struct kvm_vm *vm;
 	struct ucall uc;
 
+	pr_info("Testing guest mode: %s\n", vm_guest_mode_string(guest_mode));
+
 	vm = vm_create_shape_with_one_vcpu(shape, &vcpu, guest_code);
 
-	alignment = guest_page_size = vm_guest_mode_params[VM_MODE_DEFAULT].page_size;
-	gpa = (vm->max_gfn - TEST_NPAGES) * guest_page_size;
+	guest_page_size = vm_guest_mode_params[guest_mode].page_size;
+	host_page_size = getpagesize();
+
+	test_config.page_size = guest_page_size;
+	test_config.test_size = align_up(TEST_BASE_SIZE + test_config.page_size,
+					 host_page_size);
+	test_config.test_num_pages = vm_calc_num_guest_pages(vm->mode, test_config.test_size);
+
+	gpa = (vm->max_gfn - test_config.test_num_pages) * test_config.page_size;
 	alignment = SZ_2M;
+	alignment = max(alignment, host_page_size);
 	gpa = align_down(gpa, alignment);
 	gva = gpa & ((1ULL << (vm->va_bits - 1)) - 1);
 
-	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, gpa, TEST_SLOT,
-				    TEST_NPAGES, private ? KVM_MEM_GUEST_MEMFD : 0);
-	virt_map(vm, gva, gpa, TEST_NPAGES);
+	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+				    gpa, TEST_SLOT, test_config.test_num_pages,
+				    p->private ? KVM_MEM_GUEST_MEMFD : 0);
+	virt_map(vm, gva, gpa, test_config.test_num_pages);
 
-	if (private)
-		vm_mem_set_private(vm, gpa, TEST_SIZE);
+	if (p->private)
+		vm_mem_set_private(vm, gpa, test_config.test_size);
 
-	pre_fault_memory(vcpu, gpa, 0, SZ_2M, 0, private);
-	pre_fault_memory(vcpu, gpa, SZ_2M, PAGE_SIZE * 2, PAGE_SIZE, private);
-	pre_fault_memory(vcpu, gpa, TEST_SIZE, PAGE_SIZE, PAGE_SIZE, private);
+	pre_fault_memory(vcpu, gpa, 0, test_config.test_size, 0, p->private);
+	/* Retry the same range after the first prefault attempt. */
+	pre_fault_memory(vcpu, gpa, 0, test_config.test_size, 0, p->private);
+	pre_fault_memory(vcpu, gpa,
+			 test_config.test_size - host_page_size,
+			 host_page_size * 2, host_page_size, p->private);
+	pre_fault_memory(vcpu, gpa, test_config.test_size,
+			 host_page_size, host_page_size, p->private);
 
 	vcpu_args_set(vcpu, 1, gva);
+
+	/* Export the shared variables to the guest. */
+	sync_global_to_guest(vm, test_config);
+
 	vcpu_run(vcpu);
 
 	run = vcpu->run;
-	TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
-		    "Wanted KVM_EXIT_IO, got exit reason: %u (%s)",
+	TEST_ASSERT(run->exit_reason == UCALL_EXIT_REASON,
+		    "Wanted %s, got exit reason: %u (%s)",
+		    exit_reason_str(UCALL_EXIT_REASON),
 		    run->exit_reason, exit_reason_str(run->exit_reason));
 
 	switch (get_ucall(vcpu, &uc)) {
@@ -214,16 +251,46 @@ static void __test_pre_fault_memory(unsigned long vm_type, bool private)
 
 static void test_pre_fault_memory(unsigned long vm_type, bool private)
 {
+	struct test_params p = {
+		.vm_type = vm_type,
+		.private = private,
+	};
+
 	if (vm_type && !(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(vm_type))) {
 		pr_info("Skipping tests for vm_type 0x%lx\n", vm_type);
 		return;
 	}
 
-	__test_pre_fault_memory(vm_type, private);
+	for_each_guest_mode(__test_pre_fault_memory, &p);
+}
+
+static void help(char *name)
+{
+	puts("");
+	printf("usage: %s [-h] [-m mode]\n", name);
+	puts("");
+	guest_modes_help();
+	puts("");
 }
 
 int main(int argc, char *argv[])
 {
+	int opt;
+
+	guest_modes_append_default();
+
+	while ((opt = getopt(argc, argv, "hm:")) != -1) {
+		switch (opt) {
+		case 'm':
+			guest_modes_cmdline(optarg);
+			break;
+		case 'h':
+		default:
+			help(argv[0]);
+			exit(0);
+		}
+	}
+
 	TEST_REQUIRE(kvm_check_cap(KVM_CAP_PRE_FAULT_MEMORY));
 
 	test_pre_fault_memory(0, false);
-- 
2.43.0


^ permalink raw reply related

* [PATCH v5 2/5] KVM: arm64: Add pre_fault_memory implementation
From: Jack Thomson @ 2026-06-12 16:23 UTC (permalink / raw)
  To: maz, oupton, pbonzini
  Cc: joey.gouly, seiden, suzuki.poulose, yuzenghui, catalin.marinas,
	will, shuah, corbet, vladimir.murzin, linux-arm-kernel, kvmarm,
	kvm, linux-kernel, linux-kselftest, linux-doc, isaku.yamahata,
	Jack Thomson
In-Reply-To: <20260612162354.73378-1-jackabt.amazon@gmail.com>

From: Jack Thomson <jackabt@amazon.com>

Add arm64 support for KVM_PRE_FAULT_MEMORY by synthesizing a read data
abort and routing it through the existing stage-2 fault handlers. Treat
the requested GPA as an IPA in the userspace-owned VM's memslot space
and always target the canonical stage-2, even if the vCPU last ran with
a nested/shadow MMU selected.

If the vCPU last ran in a nested context, switch to the canonical
stage-2 with the vCPU put/load helpers so VMID, VNCR and shadow-MMU
refcount state stay consistent. Leave the switch in place for the ioctl;
vcpu_put() at ioctl exit drops the hw_mmu and the next vcpu_load()
reselects the correct MMU from vCPU state.

Check existing mappings with a shared page-table walk under the MMU read
lock, and use the resulting walk level when constructing the synthetic
fault. Report poisoned pages through the ioctl return path with
-EHWPOISON only, rather than also queueing SIGBUS as the run path does,
and use the installed mapping size to advance the prefault range.

Advertise KVM_CAP_PRE_FAULT_MEMORY on arm64. Protected VMs remain
unsupported: pKVM filters the capability, and the ioctl returns
-EOPNOTSUPP if invoked anyway.

Signed-off-by: Jack Thomson <jackabt@amazon.com>
---
 Documentation/virt/kvm/api.rst |  18 +++-
 arch/arm64/kvm/Kconfig         |   1 +
 arch/arm64/kvm/arm.c           |   1 +
 arch/arm64/kvm/mmu.c           | 162 +++++++++++++++++++++++++++++++++
 4 files changed, 178 insertions(+), 4 deletions(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 52bbbb553ce1..657e05656fa6 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6462,7 +6462,7 @@ See KVM_SET_USER_MEMORY_REGION2 for additional details.
 ---------------------------
 
 :Capability: KVM_CAP_PRE_FAULT_MEMORY
-:Architectures: none
+:Architectures: x86, arm64
 :Type: vcpu ioctl
 :Parameters: struct kvm_pre_fault_memory (in/out)
 :Returns: 0 if at least one page is processed, < 0 on error
@@ -6470,11 +6470,14 @@ See KVM_SET_USER_MEMORY_REGION2 for additional details.
 Errors:
 
   ========== ===============================================================
+  EAGAIN     A memslot update raced with the ioctl before any page was
+             processed.
   EINVAL     The specified `gpa` and `size` were invalid (e.g. not
              page aligned, causes an overflow, or size is zero).
   ENOENT     The specified `gpa` is outside defined memslots.
   EINTR      An unmasked signal is pending and no page was processed.
   EFAULT     The parameter address was invalid.
+  EHWPOISON  A poisoned host page was encountered.
   EOPNOTSUPP Mapping memory for a GPA is unsupported by the
              hypervisor, and/or for the current vCPU state/mode.
   EIO        unexpected error conditions (also causes a WARN)
@@ -6494,7 +6497,14 @@ Errors:
 KVM_PRE_FAULT_MEMORY populates KVM's stage-2 page tables used to map memory
 for the current vCPU state.  KVM maps memory as if the vCPU generated a
 stage-2 read page fault, e.g. faults in memory as needed, but doesn't break
-CoW.  However, KVM does not mark any newly created stage-2 PTE as Accessed.
+CoW.  However, on x86, KVM does not mark any newly created stage-2 PTE as
+Accessed.  On arm64, newly created stage-2 PTEs are marked Accessed.
+
+On arm64, `gpa` is interpreted as an IPA in the userspace-owned VM's
+memslot address space.  If the vCPU most recently ran a nested guest, KVM
+still targets the VM's canonical stage-2, and does not interpret `gpa` as
+a nested guest IPA or target the nested/shadow stage-2 selected by the
+vCPU's last run state.
 
 In the case of confidential VM types where there is an initial set up of
 private guest memory before the guest is 'finalized'/measured, this ioctl
@@ -6507,9 +6517,9 @@ case, the ioctl can be called in parallel.
 
 When the ioctl returns, the input values are updated to point to the
 remaining range.  If `size` > 0 on return, the caller can just issue
-the ioctl again with the same `struct kvm_map_memory` argument.
+the ioctl again with the same `struct kvm_pre_fault_memory` argument.
 
-Shadow page tables cannot support this ioctl because they
+On x86, shadow page tables cannot support this ioctl because they
 are indexed by virtual address or nested guest physical address.
 Calling this ioctl when the guest is using shadow page tables (for
 example because it is running a nested guest with nested page tables)
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 449154f9a485..6b89262e8ba7 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -24,6 +24,7 @@ menuconfig KVM
 	select HAVE_KVM_CPU_RELAX_INTERCEPT
 	select KVM_MMIO
 	select KVM_GENERIC_DIRTYLOG_READ_PROTECT
+	select KVM_GENERIC_PRE_FAULT_MEMORY
 	select VIRT_XFER_TO_GUEST_WORK
 	select KVM_VFIO
 	select HAVE_KVM_DIRTY_RING_ACQ_REL
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 9453321ef8c6..dcb92bee13af 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -392,6 +392,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_COUNTER_OFFSET:
 	case KVM_CAP_ARM_WRITABLE_IMP_ID_REGS:
 	case KVM_CAP_ARM_SEA_TO_USER:
+	case KVM_CAP_PRE_FAULT_MEMORY:
 		r = 1;
 		break;
 	case KVM_CAP_SET_GUEST_DEBUG2:
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index c720f07cb82e..4bf048bbcf8b 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1571,6 +1571,8 @@ struct kvm_s2_fault_desc {
 	struct kvm_s2_trans	*nested;
 	struct kvm_memory_slot	*memslot;
 	unsigned long		hva;
+	unsigned long		*page_size;
+	bool			prefault;
 };
 
 static int gmem_abort(const struct kvm_s2_fault_desc *s2fd)
@@ -1882,6 +1884,13 @@ static int kvm_s2_fault_pin_pfn(const struct kvm_s2_fault_desc *s2fd,
 				      &s2vi->map_writable, &s2vi->page);
 	if (unlikely(is_error_noslot_pfn(s2vi->pfn))) {
 		if (s2vi->pfn == KVM_PFN_ERR_HWPOISON) {
+			/*
+			 * When prefaulting, report the poison via -EHWPOISON
+			 * only; don't also queue a SIGBUS as the run path
+			 * does for the faulting vCPU thread.
+			 */
+			if (s2fd->prefault)
+				return -EHWPOISON;
 			kvm_send_hwpoison_signal(s2fd->hva, __ffs(s2vi->vma_pagesize));
 			return 0;
 		}
@@ -2053,6 +2062,9 @@ static int kvm_s2_fault_map(const struct kvm_s2_fault_desc *s2fd,
 	kvm_release_faultin_page(kvm, s2vi->page, !!ret, writable);
 	kvm_fault_unlock(kvm);
 
+	if (s2fd->page_size && !ret)
+		*s2fd->page_size = mapping_size;
+
 	/*
 	 * Mark the page dirty only if the fault is handled successfully,
 	 * making sure we adjust the canonical IPA if the mapping size has
@@ -2757,3 +2769,153 @@ void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
 
 	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
 }
+
+/*
+ * Prefaulting always targets the canonical stage-2.  If the vCPU last ran
+ * in a nested context, swap in the canonical MMU via the vCPU put/load
+ * helpers so that preemption, VMID, VNCR fixmap and shadow-MMU refcount
+ * state stay consistent.
+ *
+ * The swap is deliberately not undone: nothing runs in between the
+ * per-page invocations of kvm_arch_vcpu_pre_fault_memory() except the
+ * generic prefault loop, and the vcpu_put() at ioctl exit discards
+ * vcpu->arch.hw_mmu anyway (see kvm_vcpu_put_hw_mmu()), so the next
+ * vcpu_load() re-derives the correct MMU from the vCPU's context.  If the
+ * prefault task is preempted in the meantime, kvm_vcpu_put_hw_mmu()
+ * keeps the canonical MMU in place for the reload.  Leaving the swap in
+ * place also bounds the cost to at most one put/load pair per ioctl,
+ * rather than two pairs per prefaulted page.
+ */
+static void kvm_pre_fault_load_canonical_mmu(struct kvm_vcpu *vcpu)
+{
+	if (!vcpu_has_nv(vcpu) || vcpu->arch.hw_mmu == &vcpu->kvm->arch.mmu)
+		return;
+
+	preempt_disable();
+	kvm_arch_vcpu_put(vcpu);
+	vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
+	kvm_arch_vcpu_load(vcpu, smp_processor_id());
+	preempt_enable();
+}
+
+long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
+				    struct kvm_pre_fault_memory *range)
+{
+	struct kvm_vcpu_fault_info *fault_info = &vcpu->arch.fault;
+	struct kvm_vcpu_fault_info fault_backup = *fault_info;
+	s8 walk_level = KVM_PGTABLE_LAST_LEVEL;
+	unsigned long page_size = PAGE_SIZE;
+	struct kvm_memory_slot *memslot;
+	phys_addr_t gpa = range->gpa;
+	struct kvm_pgtable *pgt;
+	phys_addr_t end;
+	kvm_pte_t pte;
+	hva_t hva;
+	gfn_t gfn;
+	long ret;
+
+	if (vcpu_is_protected(vcpu))
+		return -EOPNOTSUPP;
+
+	/*
+	 * Interpret range->gpa in the userspace-owned VM's IPA space, not in
+	 * any nested guest IPA space that may have been active on the vCPU's
+	 * last run.  Always target the canonical stage-2.
+	 */
+	kvm_pre_fault_load_canonical_mmu(vcpu);
+
+	if (gpa >= kvm_phys_size(vcpu->arch.hw_mmu)) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	gfn = gpa_to_gfn(gpa);
+	memslot = gfn_to_memslot(vcpu->kvm, gfn);
+	if (!memslot) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	/*
+	 * A racing memslot deletion or move installs an invalid slot before
+	 * zapping stage-2.  Ask userspace to retry once the update settles.
+	 */
+	if (memslot->flags & KVM_MEMSLOT_INVALID) {
+		ret = -EAGAIN;
+		goto out;
+	}
+
+	/*
+	 * pKVM stage-2 mappings aren't directly walkable from the host; let
+	 * the fault path handle both new and existing mappings.
+	 */
+	if (!is_protected_kvm_enabled()) {
+		pgt = vcpu->arch.hw_mmu->pgt;
+		scoped_guard(read_lock, &vcpu->kvm->mmu_lock) {
+			ret = kvm_pgtable_get_leaf(pgt, gpa, &pte, &walk_level,
+						   KVM_PGTABLE_WALK_SHARED);
+		}
+		if (ret)
+			goto out;
+
+		if (kvm_pte_valid(pte)) {
+			page_size = kvm_granule_size(walk_level);
+			if (!(pte & KVM_PTE_LEAF_ATTR_LO_S2_AF))
+				handle_access_fault(vcpu, gpa);
+			goto out_success;
+		}
+	}
+
+	/*
+	 * Synthesize a read translation fault for the canonical IPA, at the
+	 * level where the stage-2 walk currently ends (the last level under
+	 * pKVM, where stage-2 isn't walkable from the host).
+	 */
+	fault_info->esr_el2 = (ESR_ELx_EC_DABT_LOW << ESR_ELx_EC_SHIFT) |
+		ESR_ELx_IL | ESR_ELx_FSC_FAULT_L(walk_level);
+	fault_info->hpfar_el2 = HPFAR_EL2_NS |
+		FIELD_PREP(HPFAR_EL2_FIPA, gpa >> 12);
+
+	struct kvm_s2_fault_desc s2fd = {
+		.vcpu		= vcpu,
+		.fault_ipa	= gpa,
+		.nested		= NULL,
+		.memslot	= memslot,
+		.page_size	= &page_size,
+		.prefault	= true,
+	};
+
+	/*
+	 * As in the run path, -EAGAIN from the abort handlers is treated as
+	 * progress: either a parallel fault installed the mapping, or a racing
+	 * invalidation is in flight and the next access will refault.
+	 */
+	if (kvm_slot_has_gmem(memslot)) {
+		ret = gmem_abort(&s2fd);
+	} else {
+		hva = gfn_to_hva_memslot_prot(memslot, gfn, NULL);
+		if (kvm_is_error_hva(hva)) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		s2fd.hva = hva;
+		ret = user_mem_abort(&s2fd);
+	}
+
+	if (ret < 0)
+		goto out;
+
+out_success:
+	end = ALIGN_DOWN(gpa, page_size) + page_size;
+	ret = min_t(u64, range->size, end - gpa);
+out:
+	/*
+	 * Restore the synthetic fault state so a subsequent KVM_RUN does not
+	 * observe it. kvm_handle_mmio_return() runs before guest entry can
+	 * refresh fault.esr_el2 from hardware, so leaving the synthetic ESR
+	 * in place would corrupt the completion of a pending MMIO exit.
+	 */
+	*fault_info = fault_backup;
+	return ret;
+}
-- 
2.43.0


^ permalink raw reply related

* [PATCH v5 1/5] KVM: arm64: Pass walk flags to kvm_pgtable_get_leaf()
From: Jack Thomson @ 2026-06-12 16:23 UTC (permalink / raw)
  To: maz, oupton, pbonzini
  Cc: joey.gouly, seiden, suzuki.poulose, yuzenghui, catalin.marinas,
	will, shuah, corbet, vladimir.murzin, linux-arm-kernel, kvmarm,
	kvm, linux-kernel, linux-kselftest, linux-doc, isaku.yamahata,
	Jack Thomson
In-Reply-To: <20260612162354.73378-1-jackabt.amazon@gmail.com>

From: Jack Thomson <jackabt@amazon.com>

Allow callers of kvm_pgtable_get_leaf() to specify the page-table walk
flags, in preparation for performing walks under the MMU read lock.

Reading a stage-2 leaf while only holding the read lock requires
KVM_PGTABLE_WALK_SHARED: parallel faults (which also only hold the read
lock) can unlink table pages and free them via RCU, so the walker must
be inside an RCU read-side critical section, which the shared walk flag
provides via kvm_pgtable_walk_begin().

All existing callers either hold the write lock, walk with interrupts
disabled, or run at hyp where shared walks are rejected; they keep the
current behaviour by passing no flags.

No functional change intended.

Signed-off-by: Jack Thomson <jackabt@amazon.com>
---
 arch/arm64/include/asm/kvm_pgtable.h  |  5 ++++-
 arch/arm64/kvm/hyp/nvhe/mem_protect.c | 10 +++++-----
 arch/arm64/kvm/hyp/pgtable.c          |  5 +++--
 arch/arm64/kvm/mmu.c                  |  2 +-
 arch/arm64/kvm/nested.c               |  2 +-
 5 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 41a8687938eb..d0167f7dfbee 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -859,6 +859,8 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
  * @addr:	Input address for the start of the walk.
  * @ptep:	Pointer to storage for the retrieved PTE.
  * @level:	Pointer to storage for the level of the retrieved PTE.
+ * @flags:	Flags to control the page-table walk
+ *		(see enum kvm_pgtable_walk_flags).
  *
  * The offset of @addr within a page is ignored.
  *
@@ -869,7 +871,8 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
  * Return: 0 on success, negative error code on failure.
  */
 int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr,
-			 kvm_pte_t *ptep, s8 *level);
+			 kvm_pte_t *ptep, s8 *level,
+			 enum kvm_pgtable_walk_flags flags);
 
 /**
  * kvm_pgtable_stage2_pte_prot() - Retrieve the protection attributes of a
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index 25f04629014e..3b765c9ff7e8 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -522,7 +522,7 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range)
 	int ret;
 
 	hyp_assert_lock_held(&host_mmu.lock);
-	ret = kvm_pgtable_get_leaf(&host_mmu.pgt, addr, &pte, &level);
+	ret = kvm_pgtable_get_leaf(&host_mmu.pgt, addr, &pte, &level, 0);
 	if (ret)
 		return ret;
 
@@ -890,7 +890,7 @@ static int get_valid_guest_pte(struct pkvm_hyp_vm *vm, u64 ipa, kvm_pte_t *ptep,
 	s8 level;
 	int ret;
 
-	ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, &level);
+	ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, &level, 0);
 	if (ret)
 		return ret;
 	if (guest_pte_is_poisoned(pte))
@@ -939,7 +939,7 @@ int __pkvm_vcpu_in_poison_fault(struct pkvm_hyp_vcpu *hyp_vcpu)
 	ipa |= FAR_TO_FIPA_OFFSET(kvm_vcpu_get_hfar(&hyp_vcpu->vcpu));
 
 	guest_lock_component(vm);
-	ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, &level);
+	ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, &level, 0);
 	if (ret)
 		goto unlock;
 
@@ -1293,7 +1293,7 @@ static int host_stage2_get_guest_info(phys_addr_t phys, struct pkvm_hyp_vm **vm,
 		return -EPERM;
 	}
 
-	ret = kvm_pgtable_get_leaf(&host_mmu.pgt, phys, &pte, &level);
+	ret = kvm_pgtable_get_leaf(&host_mmu.pgt, phys, &pte, &level, 0);
 	if (ret)
 		return ret;
 
@@ -1522,7 +1522,7 @@ static int __check_host_shared_guest(struct pkvm_hyp_vm *vm, u64 *__phys, u64 ip
 	s8 level;
 	int ret;
 
-	ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, &level);
+	ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, &level, 0);
 	if (ret)
 		return ret;
 	if (!kvm_pte_valid(pte))
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 0c1defa5fb0f..6a839a32e246 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -298,12 +298,13 @@ static int leaf_walker(const struct kvm_pgtable_visit_ctx *ctx,
 }
 
 int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr,
-			 kvm_pte_t *ptep, s8 *level)
+			 kvm_pte_t *ptep, s8 *level,
+			 enum kvm_pgtable_walk_flags flags)
 {
 	struct leaf_walk_data data;
 	struct kvm_pgtable_walker walker = {
 		.cb	= leaf_walker,
-		.flags	= KVM_PGTABLE_WALK_LEAF,
+		.flags	= flags | KVM_PGTABLE_WALK_LEAF,
 		.arg	= &data,
 	};
 	int ret;
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 4da9281312eb..c720f07cb82e 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -839,7 +839,7 @@ static int get_user_mapping_size(struct kvm *kvm, u64 addr)
 	 * IPI-ing threads).
 	 */
 	local_irq_save(flags);
-	ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level);
+	ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level, 0);
 	local_irq_restore(flags);
 
 	if (ret)
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index 38f672e94087..e45aed6d9e65 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -559,7 +559,7 @@ static u8 get_guest_mapping_ttl(struct kvm_s2_mmu *mmu, u64 addr)
 		return 0;
 
 	tmp &= ~(sz - 1);
-	if (kvm_pgtable_get_leaf(mmu->pgt, tmp, &pte, NULL))
+	if (kvm_pgtable_get_leaf(mmu->pgt, tmp, &pte, NULL, 0))
 		goto again;
 	if (!(pte & PTE_VALID))
 		goto again;
-- 
2.43.0


^ permalink raw reply related

* [PATCH v5 0/5] KVM: arm64: Add KVM_PRE_FAULT_MEMORY support
From: Jack Thomson @ 2026-06-12 16:23 UTC (permalink / raw)
  To: maz, oupton, pbonzini
  Cc: joey.gouly, seiden, suzuki.poulose, yuzenghui, catalin.marinas,
	will, shuah, corbet, vladimir.murzin, linux-arm-kernel, kvmarm,
	kvm, linux-kernel, linux-kselftest, linux-doc, isaku.yamahata,
	Jack Thomson

From: Jack Thomson <jackabt@amazon.com>

Hi,

This series adds arm64 support for KVM_PRE_FAULT_MEMORY, which was added
for x86 in [1]. The ioctl allows userspace to populate stage-2 mappings
before running a vCPU, reducing the number of stage-2 faults taken in
the run path. This is useful for post-copy migration, where stage-2
fault latency shows up directly in memory-intensive workloads.

On arm64, the GPA supplied to the ioctl is treated as an IPA in the
userspace-owned VM's memslot address space. If the vCPU most recently
ran a nested guest, KVM still targets the VM's canonical stage-2. It
does not interpret the GPA as an L2 IPA, and does not try to populate
the nested/shadow stage-2 selected by the vCPU's last run state.

The patches are:

 - Allow callers of kvm_pgtable_get_leaf() to pass walk flags, so the
   prefault path can walk stage-2 under the MMU read lock.

 - Add arm64 support for KVM_PRE_FAULT_MEMORY.

 - Enable pre_fault_memory_test on arm64.

 - Add a backing-source option to pre_fault_memory_test.

 - Add a nested (NV) selftest that prefaults on a vCPU whose last-run
   context is backed by a shadow stage-2 MMU with an empty nested
   stage-2 root.

The prefault flag and page_size output in the stage-2 fault descriptor
remain in this series so the arm64 implementation can advance by the
mapping granule installed by the fault path and report poison without
queueing a SIGBUS.

Tested with pre_fault_memory_test under an arm64 QEMU setup with
anonymous, shmem, anonymous_thp, anonymous_hugetlb and shared_hugetlb
backings, including 64K, 2M and 32M hugetlb pools, and with the new
nv_pre_fault_memory_test on an NV-capable setup.

=== Changes since v4 [2] ===

 - Reworked nested virt semantics: arm64 now treats the ioctl GPA as the
   VM/memslot IPA and always targets the canonical stage-2. It no longer
   translates an L2 IPA through L1's stage-2.

 - Documented the arm64 nested behavior in the KVM API text.

 - Switch to the canonical stage-2 with the vCPU put/load helpers when
   the vCPU last ran with a nested/shadow MMU, keeping VMID, VNCR and
   shadow-MMU refcount state consistent.

 - Split the kvm_pgtable_get_leaf() walk-flag plumbing into a prep patch
   and walk existing mappings with KVM_PGTABLE_WALK_SHARED under the MMU
   read lock.

 - Tightened prefault fault handling: preserve fault info, set IL in the
   synthetic ESR, handle existing mappings, return -EAGAIN for invalid
   memslot races, and report -EHWPOISON without queueing SIGBUS.

 - Avoid directly walking stage-2 page tables when pKVM is enabled.
   Protected VMs remain unsupported via -EOPNOTSUPP.

 - Preserve the selected selftest memory backing when recreating the
   racing memslot.

 - Add the nested (NV) prefault selftest, including an empty nested
   stage-2 root to catch accidental L2-IPA interpretation.

=== Changes since v3 [3] ===

 - Return -EOPNOTSUPP for protected VMs.

 - Reworked nested-vCPU handling to translate an L2 IPA through L1's
   stage-2. This has been superseded by the canonical VM-IPA semantics
   described above.

 - Make page_size unsigned and keep local declarations ordered at the
   top of kvm_arch_vcpu_pre_fault_memory().

=== Changes since v2 [4] ===

 - Update the synthetic fault info. Thanks Suzuki.

 - Remove the selftest change for unaligned mmap allocations. Thanks
   Sean.

[1]: https://lore.kernel.org/kvm/20240710174031.312055-1-pbonzini@redhat.com/
[2]: https://lore.kernel.org/linux-arm-kernel/20260113152643.18858-1-jackabt.amazon@gmail.com/
[3]: https://lore.kernel.org/linux-arm-kernel/20251119154910.97716-1-jackabt.amazon@gmail.com/
[4]: https://lore.kernel.org/linux-arm-kernel/20251013151502.6679-1-jackabt.amazon@gmail.com/

Jack Thomson (5):
  KVM: arm64: Pass walk flags to kvm_pgtable_get_leaf()
  KVM: arm64: Add pre_fault_memory implementation
  KVM: selftests: Enable pre_fault_memory_test for arm64
  KVM: selftests: Add option for different backing in pre-fault tests
  KVM: selftests: Add nested pre-fault test for arm64

 Documentation/virt/kvm/api.rst                |  18 +-
 arch/arm64/include/asm/kvm_pgtable.h          |   5 +-
 arch/arm64/kvm/Kconfig                        |   1 +
 arch/arm64/kvm/arm.c                          |   1 +
 arch/arm64/kvm/hyp/nvhe/mem_protect.c         |  10 +-
 arch/arm64/kvm/hyp/pgtable.c                  |   5 +-
 arch/arm64/kvm/mmu.c                          | 164 +++++++++++++-
 arch/arm64/kvm/nested.c                       |   2 +-
 tools/testing/selftests/kvm/Makefile.kvm      |   2 +
 .../kvm/arm64/nv_pre_fault_memory_test.c      | 200 ++++++++++++++++++
 .../selftests/kvm/pre_fault_memory_test.c     | 150 ++++++++++---
 11 files changed, 513 insertions(+), 45 deletions(-)
 create mode 100644 tools/testing/selftests/kvm/arm64/nv_pre_fault_memory_test.c


base-commit: 98f826f3c500fda08d51fca434b7aefa6a2f7076
-- 
2.43.0

^ permalink raw reply

* Re: [PATCH v2] docs/zh_CN: update admin-guide/index.rst translation
From: Yan Zhu @ 2026-06-12 16:11 UTC (permalink / raw)
  To: Alex Shi
  Cc: dzm91, alexs, corbet, frederic, gpiccoli, jani.nikula, kees,
	linux-doc, linux-kernel, longman, mchehab+huawei, si.yanteng,
	skhan, tony.luck
In-Reply-To: <b7606f72-eb55-41a7-b685-00164050e159@gmail.com>


Hi Alex:

On 6/8/2026 9:40 AM, Alex Shi wrote:
> Applied, Thanks!

Can you confirm whether this patch has been merged? I couldn’t find it 
in the repo: 
https://git.kernel.org/pub/scm/linux/kernel/git/alexs/linux.git/?h=docs-next

-- 
Thanks
Yan Zhu


^ permalink raw reply

* Re: [PATCH net-next v09 4/5] hinic3: Add ethtool rss ops
From: Simon Horman @ 2026-06-12 16:08 UTC (permalink / raw)
  To: gongfan1
  Cc: Simon Horman, wudi234, tengpeisen, netdev, davem, edumazet, kuba,
	pabeni, andrew+netdev, ioana.ciornei, mohsin.bashr, linux-kernel,
	linux-doc, luosifu, guoxin09, zhoushuai28, wulike1, shijing34,
	zhengjiezhen, maxime.chevallier
In-Reply-To: <7d1a4375fdf7c3e7a5a6162382cee4f48991d5da.1781062575.git.wudi234@huawei.com>

This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.
Full review at: https://sashiko.dev
---
hinic3: Add ethtool rss ops

This commit implements ethtool callback functions for the hinic3 driver.
These callbacks allow users to utilize ethtool for detailed RSS parameters
configuration and monitoring.

> --- a/drivers/net/ethernet/huawei/hinic3/hinic3_rss.c
> +++ b/drivers/net/ethernet/huawei/hinic3/hinic3_rss.c

[ ... ]

> +int hinic3_set_channels(struct net_device *netdev,
> +			struct ethtool_channels *channels)
> +{
[ ... ]
> +	if (netif_running(netdev)) {
> +		q_params = nic_dev->q_params;
> +		q_params.num_qps = (u16)count;
> +		q_params.txqs_res = NULL;
> +		q_params.rxqs_res = NULL;
> +		q_params.irq_cfg = NULL;
> +
> +		err = hinic3_change_channel_settings(netdev, &q_params);
> +		if (err) {
> +			netdev_err(netdev, "Failed to change channel settings\n");
> +			return err;
> +		}
> +	} else {
> +		nic_dev->q_params.num_qps = (u16)count;
> +	}
> +
> +	err = hinic3_rss_update_num_qps_and_reprogram(netdev);
> +	if (err)
> +		return err;

[Severity: Medium]
If hinic3_rss_update_num_qps_and_reprogram() fails, the error code is
returned to the caller, but the queue changes made earlier by
hinic3_change_channel_settings() remain active. Is it expected that a failure
leaves the newly requested channel configuration running?

[ ... ]

> +static int hinic3_set_rss_rxfh(struct net_device *netdev,
> +			       const u32 *indir, u8 *key)
> +{
[ ... ]
> +	if (indir) {
> +		for (i = 0; i < L2NIC_RSS_INDIR_SIZE; i++)
> +			temp_indir[i] = (u16)indir[i];
> +
> +		err = hinic3_rss_set_indir_tbl(nic_dev->hwdev, temp_indir);
> +		if (err) {
> +			netdev_err(netdev, "Failed to set rss indir table\n");
> +			return err;
> +		}
> +
> +		memcpy(nic_dev->rss_indir, temp_indir, L2NIC_RSS_INDIR_SIZE);

[Severity: High]
Does this copy the entire indirection table? Since temp_indir is an array
of u16 elements, copying L2NIC_RSS_INDIR_SIZE bytes instead of
L2NIC_RSS_INDIR_SIZE * sizeof(u16) will only update the first half of the
nic_dev->rss_indir array.

[ ... ]

> +static int hinic3_rss_get_indir_tbl(struct hinic3_hwdev *hwdev,
> +				    u32 *indir_table)
> +{
[ ... ]
> +	err = hinic3_cmdq_detail_resp(hwdev, MGMT_MOD_L2NIC,
> +				      L2NIC_UCODE_CMD_GET_RSS_INDIR_TBL,
> +				      pair.in, pair.out, NULL);
> +	if (err) {
> +		dev_err(hwdev->dev, "Failed to get rss indir table\n");
> +		goto err_get_indir_tbl;
> +	}
> +
> +	indir_tbl = (__force __le16 *)pair.out->buf;
> +	for (i = 0; i < L2NIC_RSS_INDIR_SIZE; i++)
> +		indir_table[i] = le16_to_cpu(*(indir_tbl + i));

[Severity: High]
Should the response buffer words be swabbed using hinic3_cmdq_buf_swab32()
before reading, similar to how it is handled in the set operation? Also,
does the data start immediately at offset 0, or is there a 16-byte
reserved prefix like the one defined in struct l2nic_cmd_rss_set_indir_tbl
that needs to be skipped?

[ ... ]

> +static int hinic3_update_hash_func_type(struct net_device *netdev, u8 hfunc)
> +{
> +	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
> +	enum hinic3_rss_hash_type new_rss_hash_type;
> +	int err;
[ ... ]
> +	if (new_rss_hash_type == nic_dev->rss_hash_type)
> +		return 0;
> +
> +	err = hinic3_rss_set_hash_type(nic_dev->hwdev, nic_dev->rss_hash_type);

[Severity: Medium]
Will this push the old hash type to the hardware instead of the newly
requested one? It passes nic_dev->rss_hash_type rather than
new_rss_hash_type.

> +	if (err) {
> +		netdev_err(netdev, "Failed to set RSS hash type to HW\n");
> +		return err;
> +	}
> +
> +	nic_dev->rss_hash_type = new_rss_hash_type;
> +
> +	return 0;
> +}

^ permalink raw reply

* [PATCH 03/10] docs/zh_CN: add LSM/LoadPin Chinese translation
From: Yan Zhu @ 2026-06-12 16:03 UTC (permalink / raw)
  To: alexs, si.yanteng, corbet, mic
  Cc: dzm91, skhan, gnoack, zhuyan2015, linux-doc,
	linux-security-module
In-Reply-To: <cover.1781105672.git.zhuyan2015@qq.com>

Translate Documentation/admin-guide/LSM/LoadPin.rst into Chinese.

Update the translation through commit 0860b72d535f
("security/loadpin: Update the changing interface in the source code.")

Assisted-by: Claude:deepseek-4-pro
Signed-off-by: Yan Zhu <zhuyan2015@qq.com>
---
 .../zh_CN/admin-guide/LSM/LoadPin.rst         | 33 +++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 Documentation/translations/zh_CN/admin-guide/LSM/LoadPin.rst

diff --git a/Documentation/translations/zh_CN/admin-guide/LSM/LoadPin.rst b/Documentation/translations/zh_CN/admin-guide/LSM/LoadPin.rst
new file mode 100644
index 000000000000..d75a06f515f6
--- /dev/null
+++ b/Documentation/translations/zh_CN/admin-guide/LSM/LoadPin.rst
@@ -0,0 +1,33 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: ../../disclaimer-zh_CN.rst
+
+:Original: Documentation/admin-guide/LSM/LoadPin.rst
+:翻译:
+ 朱岩 Yan Zhu <zhuyan2015@qq.com>
+
+
+=======
+LoadPin
+=======
+
+LoadPin 是一种 Linux 安全模块(LSM),保证所有内核加载的文件(模块、固件等)
+均来自同一文件系统,并且该文件系统应当以只读设备(如 dm-verity 或 CDROM)为后
+端存储。这使得在拥有经过验证和/或不可更改的文件系统的系统上,能够在不对每个文
+件单独签名的前提下,强制模块和固件的加载限制。
+
+该 LSM 在编译时通过 ``CONFIG_SECURITY_LOADPIN`` 进行选择,并且可以在启动时通
+过内核命令行参数 ``loadpin.enforce`` 进行控制。默认情况下该功能是启用的,亦可
+在启动时通过 ``loadpin.enforce=0`` 将其关闭。
+
+LoadPin 会在检测到首个文件被加载时开始生效。如果承载该文件系统的块设备不是只
+读的,系统会创建一个 sysctl 条目 ``/proc/sys/kernel/loadpin/enabled`` 用于切
+换锁定功能。可写的文件系统意味着锁定功能亦可被修改,但通过 sysctl 可以在可
+写文件系统上便捷地进行测试。
+
+也可以使用内核命令行参数 ``loadpin.exclude`` 将特定文件类型从 LoadPin 中排除。
+默认情况下所有文件均被包含,但可以通过
+``loadpin.exclude=kernel-module,kexec-image`` 等方式排除相应类型。这允许在使
+用 ``CONFIG_MODULE_SIG``、``CONFIG_KEXEC_VERIFY_SIG`` 等机制对内核模块和内核
+镜像进行校验的同时,仍然利用 LoadPin 保护其他内核加载文件的完整性。有效文件类
+型的完整列表可在 ``include/linux/kernel_read_file.h`` 中的
+``kernel_read_file_str`` 找到。
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH net-next v09 3/5] hinic3: Add ethtool coalesce ops
From: Simon Horman @ 2026-06-12 16:08 UTC (permalink / raw)
  To: gongfan1
  Cc: Simon Horman, wudi234, tengpeisen, netdev, davem, edumazet, kuba,
	pabeni, andrew+netdev, ioana.ciornei, mohsin.bashr, linux-kernel,
	linux-doc, luosifu, guoxin09, zhoushuai28, wulike1, shijing34,
	zhengjiezhen, maxime.chevallier
In-Reply-To: <dda2f285e7314fa4a682a031328766e58f3137dc.1781062575.git.wudi234@huawei.com>

This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.
Full review at: https://sashiko.dev
---
hinic3: Add ethtool coalesce ops

This patch implements the .get_coalesce and .set_coalesce ethtool callbacks
for the hinic3 driver. It allows users to configure and monitor RX coalesce
parameters.

> diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
> --- a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
> +++ b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
[ ... ]
> @@ -975,9 +980,247 @@ static void hinic3_get_pause_stats(struct net_device *netdev,
> +static int hinic3_set_queue_coalesce(struct net_device *netdev, u16 q_id,
> +				     struct hinic3_intr_coal_info *coal,
> +				     struct netlink_ext_ack *extack)
> +{
> +	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
> +	struct hinic3_intr_coal_info *intr_coal;
> +	struct hinic3_interrupt_info info = {};
> +	int err;
> +
> +	if (nic_dev->adaptive_rx_coal) {
> +		NL_SET_ERR_MSG_MOD(extack,
> +				   "Static coalesce not allowed in adaptive RX mode");
> +		return -EINVAL;
> +	}

[Severity: High]
When adaptive RX coalesce is enabled via ethtool, hinic3_set_coalesce()
sets nic_dev->adaptive_rx_coal to 1 before calling this function.

Will this check then unconditionally return -EINVAL, failing the operation
while leaving the software state permanently, and incorrectly, recording
that adaptive mode is on?

> +
> +	if (!test_bit(HINIC3_INTF_UP, &nic_dev->flags) ||
> +	    q_id >= nic_dev->q_params.num_qps)
> +		return 0;
> +
> +	spin_lock(&nic_dev->coal_lock);
> +
> +	intr_coal = &nic_dev->intr_coalesce[q_id];

[Severity: Medium]
If the interface is down, this early return skips updating the software
configuration struct.

Will this cause the new coalesce parameters to be silently lost instead
of being applied when the interface is later brought up?

> +
> +	intr_coal->coalesce_timer_cfg = coal->coalesce_timer_cfg;
> +	intr_coal->pending_limit = coal->pending_limit;
> +	intr_coal->rx_pending_limit_low = coal->rx_pending_limit_low;
> +	intr_coal->rx_pending_limit_high = coal->rx_pending_limit_high;
> +	spin_unlock(&nic_dev->coal_lock);
> +
> +	info.msix_index = nic_dev->q_params.irq_cfg[q_id].msix_entry_idx;
> +	info.interrupt_coalesc_set = 1;
> +	info.coalesc_timer_cfg = intr_coal->coalesce_timer_cfg;
> +	info.pending_limit = intr_coal->pending_limit;
> +	info.resend_timer_cfg = intr_coal->resend_timer_cfg;
> +	err = hinic3_set_interrupt_cfg(nic_dev->hwdev, info);

[Severity: High]
Does hinic3_set_interrupt_cfg() need to be protected by change_res_mutex
here?

The DIM code path in hinic3_set_interrupt_moder() holds change_res_mutex
when performing this same hardware configuration. Since ethtool commands
and the DIM workqueue can run concurrently, could this lead to a race
condition corrupting the read-modify-write hardware state?

> +	if (err) {
> +		NL_SET_ERR_MSG_FMT_MOD(extack,
> +				       "Failed to set queue%u coalesce",
> +				       q_id);
> +		return err;
> +	}

[Severity: Medium]
If hinic3_set_interrupt_cfg() fails, the software state in intr_coal
has already been updated.

Because hinic3_set_hw_coal_param() runs this in a loop over all queues,
will a hardware configuration failure leave the software state permanently
diverging from the actual physical hardware state?

[ ... ]
> +static int hinic3_get_coalesce(struct net_device *netdev,
> +			       struct ethtool_coalesce *coal,
> +			       struct kernel_ethtool_coalesce *kernel_coal,
> +			       struct netlink_ext_ack *extack)
> +{
> +	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
> +	struct hinic3_intr_coal_info *interrupt_info;
> +
> +	memset(coal, 0, sizeof(*coal));

[Severity: Medium]
Is it safe to memset the entire ethtool_coalesce structure here?

The core ethtool ioctl code pre-initializes the cmd field to
ETHTOOL_GCOALESCE, so zeroing the entire struct will wipe the cmd field
and break the ethtool UAPI.

> +
> +	interrupt_info = &nic_dev->intr_coalesce[0];
> +
> +	coal->use_adaptive_rx_coalesce = nic_dev->adaptive_rx_coal;
> +
> +	if (nic_dev->adaptive_rx_coal) {
> +		coal->rx_max_coalesced_frames_low =
> +			interrupt_info->rx_pending_limit_low *
> +			COALESCE_PENDING_LIMIT_UNIT;
> +		coal->rx_max_coalesced_frames_high =
> +			interrupt_info->rx_pending_limit_high *
> +			COALESCE_PENDING_LIMIT_UNIT;
> +	} else {
> +		/* TX/RX uses the same interrupt.
> +		 * So we only declare RX ethtool_coalesce parameters.
> +		 */
> +		coal->rx_coalesce_usecs = interrupt_info->coalesce_timer_cfg *
> +					  COALESCE_TIMER_CFG_UNIT;
> +		coal->rx_max_coalesced_frames = interrupt_info->pending_limit *
> +						COALESCE_PENDING_LIMIT_UNIT;
> +	}

[Severity: High]
When adaptive mode is active, the static coalesce fields are left as 0,
and vice versa.

During an ethtool read-modify-write cycle, will the intermediate 0 values
be passed back to hinic3_set_coalesce() and unconditionally overwrite
the driver state, wiping out the inactive configuration?

[ ... ]
> +static int hinic3_set_coalesce(struct net_device *netdev,
> +			       struct ethtool_coalesce *coal,
> +			       struct kernel_ethtool_coalesce *kernel_coal,
> +			       struct netlink_ext_ack *extack)
> +{
[ ... ]
> +	intr_coal.coalesce_timer_cfg =
> +		(u8)(coal->rx_coalesce_usecs / COALESCE_TIMER_CFG_UNIT);
> +	intr_coal.pending_limit = (u8)(coal->rx_max_coalesced_frames /
> +				      COALESCE_PENDING_LIMIT_UNIT);
> +
> +	nic_dev->adaptive_rx_coal = coal->use_adaptive_rx_coalesce;

[Severity: High]
When disabling adaptive RX coalesce, is it necessary to cancel any
pending DIM work items before programming the hardware?

If a rxq->dim.work item is already executing concurrently or scheduled,
could it fire after the ethtool command completes and overwrite these newly
configured static limits with stale dynamically calculated values?

> +
> +	intr_coal.rx_pending_limit_high =
> +		(u8)(coal->rx_max_coalesced_frames_high /
> +		     COALESCE_PENDING_LIMIT_UNIT);
> +
> +	intr_coal.rx_pending_limit_low =
> +		(u8)(coal->rx_max_coalesced_frames_low /
> +		     COALESCE_PENDING_LIMIT_UNIT);
> +
> +	/* coalesce timer or pending set to zero will disable coalesce */
> +	if (!nic_dev->adaptive_rx_coal &&
> +	    (!intr_coal.coalesce_timer_cfg || !intr_coal.pending_limit))
> +		NL_SET_ERR_MSG_MOD(extack, "Coalesce will be disabled");
> +
> +	return hinic3_set_hw_coal_param(netdev, &intr_coal, extack);
> +}

^ permalink raw reply

* Re: [PATCH net-next v09 2/5] hinic3: Add ethtool statistic ops
From: Simon Horman @ 2026-06-12 16:07 UTC (permalink / raw)
  To: gongfan1
  Cc: Simon Horman, wudi234, tengpeisen, netdev, davem, edumazet, kuba,
	pabeni, andrew+netdev, ioana.ciornei, mohsin.bashr, linux-kernel,
	linux-doc, luosifu, guoxin09, zhoushuai28, wulike1, shijing34,
	zhengjiezhen, maxime.chevallier
In-Reply-To: <d3a84fab341ec7060236a7b74709e48f622df98e.1781062575.git.wudi234@huawei.com>

This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.
Full review at: https://sashiko.dev
---
Subject: hinic3: Add ethtool statistic ops

This commit adds PF/VF statistics functions for TX and RX processing in the
hinic3 driver. It implements various ethtool callbacks to allow detailed
netdev stats monitoring. It also adds a helper for management commands
requiring separate I/O buffers to retrieve expanded MAC stats.

diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
[ ... ]
> +static struct hinic3_stats hinic3_function_stats[] = {
> +	HINIC3_FUNC_STAT(tx_unicast_pkts_vport),
[ ... ]
> +static struct hinic3_stats hinic3_port_stats[] = {
> +	HINIC3_PORT_STAT(mac_tx_fragment_pkt_num),

[Severity: Medium]
Does this duplicate standard network counters into the private ethtool strings
interface?

The networking driver guidelines note that stats with a standard UAPI should
not be duplicated in the private strings.

The arrays for function and port stats export standard statistics like unicast
packets or MAC errors. Could these be exposed exclusively through standard
interfaces like rtnl_link_stats64 or ethtool_mac_stats instead?

[ ... ]
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_rx.c b/drivers/net/ethernet/huawei/hinic3/hinic3_rx.c
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_rx.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_rx.c
[ ... ]
> @@ -588,5 +643,10 @@ int hinic3_rx_poll(struct hinic3_rxq *rxq, int budget)
>  	if (rxq->delta >= HINIC3_RX_BUFFER_WRITE)
>  		hinic3_rx_fill_buffers(rxq);
>  
> +	u64_stats_update_begin(&rxq->rxq_stats.syncp);
> +	rxq->rxq_stats.packets += (u64)nr_pkts;
> +	rxq->rxq_stats.bytes += rx_bytes;
> +	u64_stats_update_end(&rxq->rxq_stats.syncp);

[Severity: Medium]
Does this calculation undercount the number of physical packets received when
Large Receive Offload is active?

Earlier in the loop, the code accounts for aggregated wire packets by adding
the length of the headers for the aggregated packets to the bytes counter.

However, the rxq_stats.packets update only adds the number of host SKBs,
omitting the extra wire packets entirely.

[ ... ]
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_tx.c b/drivers/net/ethernet/huawei/hinic3/hinic3_tx.c
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_tx.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_tx.c
[ ... ]
> @@ -260,6 +267,9 @@ static int hinic3_tx_csum(struct hinic3_txq *txq, struct hinic3_sq_task *task,
>  		if (l4_proto != IPPROTO_UDP ||
>  		    ((struct udphdr *)skb_transport_header(skb))->dest !=
>  		    VXLAN_OFFLOAD_PORT_LE) {
> +			u64_stats_update_begin(&txq->txq_stats.syncp);
> +			txq->txq_stats.unknown_tunnel_pkt++;
> +			u64_stats_update_end(&txq->txq_stats.syncp);
>  			/* Unsupported tunnel packet, disable csum offload */
>  			skb_checksum_help(skb);
>  			return 0;

[Severity: High]
This is a pre-existing issue, but does ignoring the return value of the
software checksum fallback risk silent data corruption?

If the calculation in skb_checksum_help fails, for example due to an
allocation failure when ensuring the buffer is writable, the packet checksum
remains uncalculated. 

Since the error is ignored and 0 is returned, the driver proceeds to transmit
the packet without hardware offload, resulting in a corrupted packet being
sent onto the network.

[ ... ]
> +static void hinic3_get_pkt_stats(struct hinic3_txq *txq, struct sk_buff *skb)
> +{
> +	u32 hdr_len, tx_bytes;
> +	unsigned short pkts;
> +
> +	if (skb_is_gso(skb)) {
> +		hdr_len = (skb_shinfo(skb)->gso_segs - 1) *
> +			  skb_tcp_all_headers(skb);
> +		tx_bytes = skb->len + hdr_len;

[Severity: High]
Can this sequence cause an out-of-bounds read and statistics corruption for
UDP GSO packets?

The code assumes all GSO packets use TCP by calling a helper that reads the
TCP header length. If the driver handles UDP-encapsulated offloads, the
transport header is much shorter. 

Reading the data offset from a TCP header location accesses bytes past the end
of the UDP header. This could read arbitrary payload data as a header length
multiplier, corrupting the tx_bytes statistic and potentially causing a
page fault.

[ ... ]

^ permalink raw reply

* Re: [PATCH net-next v09 1/5] hinic3: Add ethtool queue ops
From: Simon Horman @ 2026-06-12 16:06 UTC (permalink / raw)
  To: gongfan1
  Cc: Simon Horman, wudi234, tengpeisen, netdev, davem, edumazet, kuba,
	pabeni, andrew+netdev, ioana.ciornei, mohsin.bashr, linux-kernel,
	linux-doc, luosifu, guoxin09, zhoushuai28, wulike1, shijing34,
	zhengjiezhen, maxime.chevallier
In-Reply-To: <02e87952a65aa268526ade2f03de6c76fbc1fe9d.1781062575.git.wudi234@huawei.com>

This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.
Full review at: https://sashiko.dev
---
hinic3: Add ethtool queue ops

This commit implements ethtool callback functions .get_ringparam and
.set_ringparam. These callbacks allow users to utilize ethtool for detailed
queue depth configuration and monitoring.

> diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
> index 90fc16288de95..be9992a235f7b 100644
> --- a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
> +++ b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
[ ... ]
> @@ -409,6 +410,96 @@ hinic3_get_link_ksettings(struct net_device *netdev,
>  	return 0;
>  }
>  
> +static void hinic3_get_ringparam(struct net_device *netdev,
> +				 struct ethtool_ringparam *ring,
> +				 struct kernel_ethtool_ringparam *kernel_ring,
> +				 struct netlink_ext_ack *extack)
> +{
> +	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
> +
> +	ring->rx_max_pending = HINIC3_MAX_RX_QUEUE_DEPTH;
> +	ring->tx_max_pending = HINIC3_MAX_TX_QUEUE_DEPTH;
> +	ring->rx_pending = nic_dev->q_params.rq_depth;
> +	ring->rx_pending = nic_dev->q_params.sq_depth;

[Severity: High]
Does this code accidentally assign rx_pending twice and omit tx_pending?
Querying the interface will return 0 for the Tx queue depth and an
incorrect value for the Rx queue depth, which also breaks partial
ring updates.

> +}
> +
[ ... ]
> diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c b/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c
> index e7d6c2033b451..bc4d879f9be4e 100644
> --- a/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c
> +++ b/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c
> @@ -137,7 +137,8 @@ static int hinic3_set_interrupt_moder(struct net_device *netdev, u16 q_id,
>  	struct hinic3_interrupt_info info = {};
>  	int err;
>  
> -	if (q_id >= nic_dev->q_params.num_qps)
> +	if (q_id >= nic_dev->q_params.num_qps ||
> +	    !mutex_trylock(&nic_dev->change_res_mutex))
>  		return 0;

[Severity: High]
Is it safe to silently return 0 (success) without applying the interrupt
moderation if the mutex is contended? This appears to drop hardware
configuration updates without informing the caller.

>  
>  	info.interrupt_coalesc_set = 1;
[ ... ]
> diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_main.c b/drivers/net/ethernet/huawei/hinic3/hinic3_main.c
> index 0a888fe4c975b..c87624a5e5dcd 100644
> --- a/drivers/net/ethernet/huawei/hinic3/hinic3_main.c
> +++ b/drivers/net/ethernet/huawei/hinic3/hinic3_main.c
[ ... ]
> @@ -315,6 +316,9 @@ static void hinic3_link_status_change(struct net_device *netdev,
>  {
>  	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
>  
> +	if (!mutex_trylock(&nic_dev->change_res_mutex))
> +		return;
> +

[Severity: High]
Can this usage of mutex_trylock cause physical link status events to be
discarded? If the lock is held (e.g., during an ethtool ring size change),
the OS link state could permanently diverge from the physical link state.

>  	if (link_status_up) {
>  		if (netif_carrier_ok(netdev))
>  			return;

[Severity: Critical]
Does this early return path leak the mutex? We acquired change_res_mutex
earlier, but it looks like we exit without unlocking it, which would
cause subsequent deadlocks.

> @@ -330,6 +334,8 @@ static void hinic3_link_status_change(struct net_device *netdev,
>  		netif_carrier_off(netdev);
>  		netdev_dbg(netdev, "Link is down\n");
>  	}
> +
> +	mutex_unlock(&nic_dev->change_res_mutex);
>  }

[Severity: Critical]
There is another early return just above this code:
        if (!netif_carrier_ok(netdev))
                return;
Similarly, does this early return also forget to call mutex_unlock()?

>  
>  static void hinic3_port_module_event_handler(struct net_device *netdev,
> diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_netdev_ops.c b/drivers/net/ethernet/huawei/hinic3/hinic3_netdev_ops.c
> index da73811641a9d..047214cfc7532 100644
> --- a/drivers/net/ethernet/huawei/hinic3/hinic3_netdev_ops.c
> +++ b/drivers/net/ethernet/huawei/hinic3/hinic3_netdev_ops.c
[ ... ]
> @@ -428,6 +439,74 @@ static void hinic3_vport_down(struct net_device *netdev)
>  	}
>  }
>  
> +int
> +hinic3_change_channel_settings(struct net_device *netdev,
> +			       struct hinic3_dyna_txrxq_params *trxq_params)
> +{
[ ... ]
> +	err = hinic3_vport_up(netdev);
> +	if (err)
> +		goto err_close_channel;
> +
> +	hinic3_free_channel_resources(netdev, &cur_qp_params, &cur_trxq_params);
> +
> +	mutex_unlock(&nic_dev->change_res_mutex);
> +
> +	return 0;
> +
> +err_close_channel:
> +	hinic3_close_channel(netdev);
> +err_qp_irq_reset:
> +	nic_dev->q_params = cur_trxq_params;
> +
> +	if (trxq_params->num_qps > cur_trxq_params.num_qps)
> +		hinic3_qp_irq_change(netdev, cur_trxq_params.num_qps);
> +	hinic3_free_qp_ctxts(nic_dev);

[Severity: High]
Is hinic3_free_qp_ctxts() called twice sequentially in this error path?
The err_close_channel block calls hinic3_close_channel(), which internally
invokes hinic3_free_qp_ctxts(). The flow then falls through to
err_qp_irq_reset which explicitly calls hinic3_free_qp_ctxts() again.

> +err_uninit_qps:
> +	hinic3_get_cur_qps(nic_dev, &new_qp_params);
> +	hinic3_free_channel_resources(netdev, &new_qp_params, trxq_params);
> +	hinic3_free_channel_resources(netdev, &cur_qp_params, &cur_trxq_params);

[Severity: Critical]
Does this error path destroy both the new and the old working channel
resources? By calling hinic3_free_channel_resources() on cur_qp_params,
we leave dangling pointers in nic_dev->q_params and nic_io. It seems
subsequent packet transmissions would trigger a use-after-free, and
closing the interface later would trigger a double-free.

> +	mutex_unlock(&nic_dev->change_res_mutex);
> +
> +	return err;
> +}
> +
>  static int hinic3_open(struct net_device *netdev)
>  {
>  	struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
> @@ -458,6 +537,10 @@ static int hinic3_open(struct net_device *netdev)
>  
>  	hinic3_init_qps(nic_dev, &qp_params);
>  
> +	err = hinic3_prepare_channel(netdev, &nic_dev->q_params);
> +	if (err)
> +		goto err_uninit_qps;
> +
>  	err = hinic3_open_channel(netdev);
>  	if (err)
>  		goto err_uninit_qps;

[Severity: High]
Does jumping to err_uninit_qps leak the QP contexts allocated by
hinic3_prepare_channel()?

Since hinic3_prepare_channel() calls hinic3_init_qp_ctxts(), it looks like
we bypass the cleanup that happens inside hinic3_close_channel(), which
would permanently leak the root context configuration and associated DMA
memory.

^ permalink raw reply

* [PATCH 08/10] docs/zh_CN: add LSM/SafeSetID Chinese translation
From: Yan Zhu @ 2026-06-12 15:58 UTC (permalink / raw)
  To: alexs, si.yanteng, corbet, mic
  Cc: dzm91, skhan, gnoack, zhuyan2015, linux-doc,
	linux-security-module
In-Reply-To: <cover.1781105672.git.zhuyan2015@qq.com>

Translate Documentation/admin-guide/LSM/SafeSetID.rst into Chinese.

Update the translation through commit c34921670736
("Documentation: Fix admin-guide typos")

Assisted-by: Claude:deepseek-4-pro
Signed-off-by: Yan Zhu <zhuyan2015@qq.com>
---
 .../zh_CN/admin-guide/LSM/SafeSetID.rst       | 82 +++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100644 Documentation/translations/zh_CN/admin-guide/LSM/SafeSetID.rst

diff --git a/Documentation/translations/zh_CN/admin-guide/LSM/SafeSetID.rst b/Documentation/translations/zh_CN/admin-guide/LSM/SafeSetID.rst
new file mode 100644
index 000000000000..3f96df3ed776
--- /dev/null
+++ b/Documentation/translations/zh_CN/admin-guide/LSM/SafeSetID.rst
@@ -0,0 +1,82 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: ../../disclaimer-zh_CN.rst
+
+:Original: Documentation/admin-guide/LSM/SafeSetID.rst
+:翻译:
+ 朱岩 Yan Zhu <zhuyan2015@qq.com>
+
+
+=========
+SafeSetID
+=========
+
+SafeSetID 是一个 LSM 模块,用于对 setid 系列系统调用进行门控,限制 UID/GID
+的转变只能在系统范围的白名单中批准的 UID/GID 之间进行。这些限制还禁止给定的
+UID/GID 获得与 ``CAP_SET{U/G}ID`` 关联的辅助特权,例如允许用户设置用户命名空
+间的 UID/GID 映射。
+
+背景
+====
+在缺少文件能力的情况下,需要切换到其他用户的进程必须具备 ``CAP_SETUID`` 权限。
+``CAP_SETUID`` 只授予以 root 身份运行的程序或显式获得 ``CAP_SETUID`` 运行时
+能力的非 root 程序。相较于文件能力,通常更推荐使用 Linux 运行时能力,因为使
+用文件能力以提升的权限运行程序会带来潜在的安全风险——任何拥有该文件访问权限的
+用户都可以通过 ``exec()`` 运行该程序来获得提升的特权。
+
+虽然可以通过授予完整的 ``CAP_SET{U/G}ID`` 能力来实现进程树,但这往往与在
+非 root 用户下运行进程树的目标相冲突。尤其 ``CAP_SETUID`` 允许切换到系统上任
+何用户,包括 root,这在很多场景中过于强大。实际中多数程序仅调用 ``setuid()``
+降低特权,而非提升特权。Linux 并未提供通用机制限制用户通过 ``setuid()`` 能切
+换到的 UID 范围,除非允许其切换到系统上任意用户。SafeSetID LSM 正是为了解决
+这一问题。
+
+主要使用场景是允许非 root 程序在不拥有完整 ``CAP_SETUID`` 能力的情况下,安全
+地切换到其他非受信任的 UID。该非 root 程序仍需 ``CAP_SETUID`` 才能执行任何转
+变,但 SafeSetID 施加的额外限制使其成为 ``CAP_SETUID`` 的“安全版”,防止其进
+行未授权操作(如切换到 UID 0 或创建/进入新的用户命名空间)。这为系统服务实现
+基于 UID 的沙箱化提供了可能,而无需在大量非 root 程序上分配完整的
+``CAP_SETUID``。
+
+其他已考虑的方案
+================
+
+在用户空间解决此问题
+--------------------
+可以通过在用户空间完全移除 setid 能力并使用特权帮助程序来完成进程的 UID/GID
+转换。然而,这会影响大量与进程生成相关的语义,如 ``fork()`` 后不立即
+``exec()`` 的行为、父进程自定义环境变量或命令行参数、以及文件句柄跨
+``fork()/exec()`` 的继承等。因此,此类方案对依赖特定进程生成语义的现有项目
+支持度较低。
+
+使用用户命名空间
+----------------
+另一种思路是在独立的用户命名空间中运行进程树,并在该命名空间内授予 setid 能力。
+这样,进程可以在自己的命名空间内自由切换 UID/GID,但只能映射到系统范围白名单
+中的 UID/GID。遗憾的是,用户命名空间往往需要与其他命名空间配合使用,例如网络
+或 PID 命名空间,否则会导致失去 ``CAP_NET_ADMIN`` 等关键能力,限制了实际可用
+性。
+
+使用已有 LSM
+------------
+当前树中没有任何其他 LSM 能够对 setid 转换进行门控,也没有实现
+``security_task_fix_setuid`` 钩子。SELinux 对此钩子声明:
+"由于 setuid 仅影响当前进程,并且 SELinux 的权限控制不基于 Linux 标识属性,
+因此 SELinux 不需要控制此操作。"
+
+使用方法
+========
+SafeSetID 在 ``securityfs`` 中通过写入 ``safesetid/uid_allowlist_policy``
+与 ``safesetid/gid_allowlist_policy`` 文件来配置策略。策略的格式为
+``<UID>:<UID>`` 或 ``<GID>:<GID>``(使用十进制数字),并以换行符结束,例如
+``123:456\n``。写入空字符串 ``""`` 可清空策略。为特定 UID/GID 配置策略后,将
+阻止该 UID/GID 获得 ``CAP_SET{U/G}ID`` 相关的辅助特权,例如设置用户命名空间
+UID/GID 映射。
+
+GID 策略与 ``setgroups()``
+==========================
+在 v5.9 中已加入对 ``CAP_SETGID`` 限制的支持,与之前对 ``CAP_SETUID`` 的处理
+相同。然而,为了兼容用户空间常见的沙箱化代码规范,目前允许具有 ``CAP_SETGID``
+限制的进程调用任意 ``setgroups()``。这意味着在这些 ``setgroups()`` 限制策略
+检查代码就位之前,**当前的 GID 策略并未提供任何有意义的安全保障**。
+``setgroups()`` 的限制将在未来版本中加入策略检查代码后真正生效,该代码将依赖
+于 v5.9 中加入的 GID 策略配置代码。
-- 
2.43.0


^ permalink raw reply related

* [PATCH 03/10] docs/zh_CN: add LSM/LoadPin Chinese translation
From: Yan Zhu @ 2026-06-12 15:58 UTC (permalink / raw)
  To: alexs, si.yanteng, corbet, mic
  Cc: dzm91, skhan, gnoack, zhuyan2015, linux-doc,
	linux-security-module
In-Reply-To: <cover.1781105672.git.zhuyan2015@qq.com>

Translate Documentation/admin-guide/LSM/LoadPin.rst into Chinese.

Update the translation through commit 0860b72d535f
("security/loadpin: Update the changing interface in the source code.")

Assisted-by: Claude:deepseek-4-pro
Signed-off-by: Yan Zhu <zhuyan2015@qq.com>
---
 .../zh_CN/admin-guide/LSM/LoadPin.rst         | 33 +++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 Documentation/translations/zh_CN/admin-guide/LSM/LoadPin.rst

diff --git a/Documentation/translations/zh_CN/admin-guide/LSM/LoadPin.rst b/Documentation/translations/zh_CN/admin-guide/LSM/LoadPin.rst
new file mode 100644
index 000000000000..d75a06f515f6
--- /dev/null
+++ b/Documentation/translations/zh_CN/admin-guide/LSM/LoadPin.rst
@@ -0,0 +1,33 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: ../../disclaimer-zh_CN.rst
+
+:Original: Documentation/admin-guide/LSM/LoadPin.rst
+:翻译:
+ 朱岩 Yan Zhu <zhuyan2015@qq.com>
+
+
+=======
+LoadPin
+=======
+
+LoadPin 是一种 Linux 安全模块(LSM),保证所有内核加载的文件(模块、固件等)
+均来自同一文件系统,并且该文件系统应当以只读设备(如 dm-verity 或 CDROM)为后
+端存储。这使得在拥有经过验证和/或不可更改的文件系统的系统上,能够在不对每个文
+件单独签名的前提下,强制模块和固件的加载限制。
+
+该 LSM 在编译时通过 ``CONFIG_SECURITY_LOADPIN`` 进行选择,并且可以在启动时通
+过内核命令行参数 ``loadpin.enforce`` 进行控制。默认情况下该功能是启用的,亦可
+在启动时通过 ``loadpin.enforce=0`` 将其关闭。
+
+LoadPin 会在检测到首个文件被加载时开始生效。如果承载该文件系统的块设备不是只
+读的,系统会创建一个 sysctl 条目 ``/proc/sys/kernel/loadpin/enabled`` 用于切
+换锁定功能。可写的文件系统意味着锁定功能亦可被修改,但通过 sysctl 可以在可
+写文件系统上便捷地进行测试。
+
+也可以使用内核命令行参数 ``loadpin.exclude`` 将特定文件类型从 LoadPin 中排除。
+默认情况下所有文件均被包含,但可以通过
+``loadpin.exclude=kernel-module,kexec-image`` 等方式排除相应类型。这允许在使
+用 ``CONFIG_MODULE_SIG``、``CONFIG_KEXEC_VERIFY_SIG`` 等机制对内核模块和内核
+镜像进行校验的同时,仍然利用 LoadPin 保护其他内核加载文件的完整性。有效文件类
+型的完整列表可在 ``include/linux/kernel_read_file.h`` 中的
+``kernel_read_file_str`` 找到。
-- 
2.43.0


^ permalink raw reply related

* [PATCH 10/10] docs/zh_CN: add LSM/landlock Chinese translation
From: Yan Zhu @ 2026-06-12 15:58 UTC (permalink / raw)
  To: alexs, si.yanteng, corbet, mic
  Cc: dzm91, skhan, gnoack, zhuyan2015, linux-doc,
	linux-security-module
In-Reply-To: <cover.1781105672.git.zhuyan2015@qq.com>

Translate Documentation/admin-guide/LSM/landlock.rst into Chinese.

Update the translation through commit de4b09abf088
("landlock: Document audit blocker field format")

Assisted-by: Claude:deepseek-4-pro
Signed-off-by: Yan Zhu <zhuyan2015@qq.com>
---
 .../zh_CN/admin-guide/LSM/landlock.rst        | 169 ++++++++++++++++++
 1 file changed, 169 insertions(+)
 create mode 100644 Documentation/translations/zh_CN/admin-guide/LSM/landlock.rst

diff --git a/Documentation/translations/zh_CN/admin-guide/LSM/landlock.rst b/Documentation/translations/zh_CN/admin-guide/LSM/landlock.rst
new file mode 100644
index 000000000000..7c465ce1c774
--- /dev/null
+++ b/Documentation/translations/zh_CN/admin-guide/LSM/landlock.rst
@@ -0,0 +1,169 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. Copyright © 2025 Microsoft Corporation
+.. include:: ../../disclaimer-zh_CN.rst
+
+:Original: Documentation/admin-guide/LSM/landlock.rst
+:翻译:
+ 朱岩 Yan Zhu <zhuyan2015@qq.com>
+
+
+========================
+Landlock:系统范围的管理
+========================
+
+:Author: Mickaël Salaün
+:Date: 2026年1月
+
+Landlock 可以利用审计框架记录事件。
+
+用户空间文档位于:Documentation/userspace-api/landlock.rst。
+
+审计
+====
+
+默认情况下,若启用了 ``audit``,沙箱程序的被拒访问请求会被记录。此默认行为可
+通过 ``sys_landlock_restrict_self()`` 标志更改(参见
+Documentation/userspace-api/landlock.rst)。审计规则也可以屏蔽 Landlock 日志。
+Landlock 能生成两种审计记录类型。
+
+记录类型
+--------
+
+AUDIT_LANDLOCK_ACCESS
+    此记录标识对内核资源的拒绝访问请求。``domain`` 字段指示阻止该请求的
+    域 ID。``blockers`` 字段列出导致拒绝的原因(逗号分隔),随后字段标识
+    内核对象(类似 SELinux)。一次审计事件可能出现多个此类记录。
+
+    示例:同一事件中两条文件链接请求记录::
+
+        domain=195ba459b blockers=fs.refer path="/usr/bin" dev="vda2" ino=351
+        domain=195ba459b blockers=fs.make_reg,fs.refer path="/usr/local" dev="vda2" ino=365
+
+    ``blockers`` 字段使用点分前缀标识限制类型:
+
+    **fs.*** – 文件系统访问权(ABI 1+):
+        - fs.execute, fs.write_file, fs.read_file, fs.read_dir
+        - fs.remove_dir, fs.remove_file
+        - fs.make_char, fs.make_dir, fs.make_reg, fs.make_sock
+        - fs.make_fifo, fs.make_block, fs.make_sym
+        - fs.refer (ABI 2+)
+        - fs.truncate (ABI 3+)
+        - fs.ioctl_dev (ABI 5+)
+
+    **net.*** – 网络访问权(ABI 4+):
+        - net.bind_tcp – TCP 端口绑定被拒绝
+        - net.connect_tcp – TCP 连接被拒绝
+
+    **scope.*** – IPC 范围限制(ABI 6+):
+        - scope.abstract_unix_socket – 抽象 UNIX 套接字连接被拒绝
+        - scope.signal – 信号发送被拒绝
+
+    当多个访问权缺失时,可出现多个 ``blockers``(逗号分隔),例如缺少
+    ``make_reg`` 与 ``refer`` 权限时会显示 ``blockers=fs.make_reg,fs.refer``。
+
+    对象标识字段(路径、设备、inode 等)根据被阻止的访问类型提供上下文信息。
+
+AUDIT_LANDLOCK_DOMAIN
+    此记录描述 Landlock 域的状态。``status`` 字段为 ``allocated`` 或
+    ``deallocated``。
+
+    ``allocated`` 状态随同首条 ``AUDIT_LANDLOCK_ACCESS`` 记录出现,
+    提供以下信息:
+
+    - 域 ID(``domain``)
+    - 强制模式(``mode``)
+    - 创建域的 PID(``pid``)
+    - 创建域的 UID(``uid``)
+    - 创建域的可执行路径(``exe``)
+    - 创建域的命令行(``comm``)
+
+    示例::
+
+        domain=195ba459b status=allocated mode=enforcing pid=300 uid=0 ...
+
+    ``deallocated`` 为单独事件,表示域释放。此后该域 ID 不会在系统生命周期中
+    再次使用。``deallocated`` 记录包含域 ID 与 ``denials`` 字段,后者统计该
+    域被拒绝的请求数。
+
+    示例::
+
+        domain=195ba459b status=deallocated denials=3
+
+事件示例
+--------
+
+以下为两组审计日志示例(含序列号)。
+
+示例 1:沙箱程序 ``kill`` 试图向 init 进程发送信号,
+被 ``scope.signal`` 限制拒绝::
+
+  $ LL_FS_RO=/ LL_FS_RW=/ LL_SCOPED=s LL_FORCE_LOG=1 ./sandboxer kill 1
+
+此命令生成两个事件,每个事件都由一个时间戳后跟一个唯一的序列号标识
+(``msg=audit(1729738800.268:30)``)。第一个事件(序列号为“30”)包含 4 条记
+录。第一条记录(``type=LANDLOCK_ACCESS``)显示域 `1a6fdc66f` 拒绝了访问。拒
+绝的原因是信号范围限制(``blockers=scope.signal``)。本应接收此信号的进程是
+init 进程(``opid=1 ocomm="systemd"``)。
+
+第二条记录(``type=LANDLOCK_DOMAIN``)描述了(``status=allocated``)域
+`1a6fdc66f`。此域由进程 ``286`` 创建,该进程执行了由 root 用户启动的
+`/root/sandboxer` 程序。
+
+第三条记录(``type=SYSCALL``)描述了系统调用、其提供的参数、其结果
+``success=no exit=-1`` 以及调用它的进程。
+
+第四条记录(``type=PROCTITLE``)以十六进制值的形式显示命令名称。这可以通过执
+行 ``python -c 'print(bytes.fromhex("6B696C6C0031"))'`` 进行转换。
+
+最后一条记录(``type=LANDLOCK_DOMAIN``)也是第二个事件(序列号“31”)中的唯一
+一条记录。它并非与直接的用户空间操作相关,而是异步操作以释放与 Landlock 域相
+关的资源(``status=deallocated``)。这有助于了解后续的日志将不再涉及域
+`1a6fdc66f`。此记录还总结了该域拒绝的请求数量(``denials=1``),以及这些请求
+是否被记录。
+
+
+.. code-block::
+
+  type=LANDLOCK_ACCESS msg=audit(1729738800.268:30): domain=1a6fdc66f blockers=scope.signal opid=1 ocomm="systemd"
+  type=LANDLOCK_DOMAIN msg=audit(1729738800.268:30): domain=1a6fdc66f status=allocated mode=enforcing pid=286 uid=0 exe="/root/sandboxer" comm="sandboxer"
+  type=SYSCALL msg=audit(1729738800.268:30): arch=c000003e syscall=62 success=no exit=-1 [..] ppid=272 pid=286 auid=0 uid=0 gid=0 [...] comm="kill" [...]
+  type=PROCTITLE msg=audit(1729738800.268:30): proctitle=6B696C6C0031
+  type=LANDLOCK_DOMAIN msg=audit(1729738800.324:31): domain=1a6fdc66f status=deallocated denials=1
+
+
+示例 2:文件系统访问控制示例::
+
+    $ LL_FS_RO=/ LL_FS_RW=/tmp LL_FORCE_LOG=1 ./sandboxer sh -c "echo > /etc/passwd"
+
+相关的审计日志包含由同一个域 `1a6fdc679` 创建的 3 个不同事件(序列号 33、34
+和 35)的 8 条记录::
+
+  type=LANDLOCK_ACCESS msg=audit(1729738800.221:33): domain=1a6fdc679 blockers=fs.write_file path="/dev/tty" dev="devtmpfs" ino=9
+  type=LANDLOCK_DOMAIN msg=audit(1729738800.221:33): domain=1a6fdc679 status=allocated mode=enforcing pid=289 uid=0 exe="/root/sandboxer" comm="sandboxer"
+  type=SYSCALL msg=audit(1729738800.221:33): arch=c000003e syscall=257 success=no exit=-13 [...] ppid=272 pid=289 auid=0 uid=0 gid=0 [...] comm="sh" [...]
+  type=PROCTITLE msg=audit(1729738800.221:33): proctitle=7368002D63006563686F203E202F6574632F706173737764
+  type=LANDLOCK_ACCESS msg=audit(1729738800.221:34): domain=1a6fdc679 blockers=fs.write_file path="/etc/passwd" dev="vda2" ino=143821
+  type=SYSCALL msg=audit(1729738800.221:34): arch=c000003e syscall=257 success=no exit=-13 [...] ppid=272 pid=289 auid=0 uid=0 gid=0 [...] comm="sh" [...]
+  type=PROCTITLE msg=audit(1729738800.221:34): proctitle=7368002D63006563686F203E202F6574632F706173737764
+  type=LANDLOCK_DOMAIN msg=audit(1729738800.261:35): domain=1a6fdc679 status=deallocated denials=2
+
+
+过滤审计日志
+------------
+
+如果审计日志数量过多,可通过以下两种方式过滤噪声:
+
+- 使用 ``sys_landlock_restrict_self()`` 的标志对沙箱程序进行配置;
+- 或使用审计规则(参见 :manpage:`auditctl(8)`)过滤。
+
+补充文档
+--------
+
+* `Linux Audit Documentation`_
+* Documentation/userspace-api/landlock.rst
+* Documentation/security/landlock.rst
+* https://landlock.io
+
+.. 链接
+.. _Linux Audit Documentation:
+   https://github.com/linux-audit/audit-documentation/wiki
-- 
2.43.0


^ permalink raw reply related

* [PATCH 09/10] docs/zh_CN: add LSM/ipe Chinese translation
From: Yan Zhu @ 2026-06-12 15:58 UTC (permalink / raw)
  To: alexs, si.yanteng, corbet, mic
  Cc: dzm91, skhan, gnoack, zhuyan2015, linux-doc,
	linux-security-module
In-Reply-To: <cover.1781105672.git.zhuyan2015@qq.com>

Translate Documentation/admin-guide/LSM/ipe.rst into Chinese.

Update the translation through commit d7ba853c0e47
("ipe: Update documentation for script enforcement")

Assisted-by: Claude:deepseek-4-pro
Signed-off-by: Yan Zhu <zhuyan2015@qq.com>
---
 .../zh_CN/admin-guide/LSM/ipe.rst             | 723 ++++++++++++++++++
 1 file changed, 723 insertions(+)
 create mode 100644 Documentation/translations/zh_CN/admin-guide/LSM/ipe.rst

diff --git a/Documentation/translations/zh_CN/admin-guide/LSM/ipe.rst b/Documentation/translations/zh_CN/admin-guide/LSM/ipe.rst
new file mode 100644
index 000000000000..08af2d03d400
--- /dev/null
+++ b/Documentation/translations/zh_CN/admin-guide/LSM/ipe.rst
@@ -0,0 +1,723 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: ../../disclaimer-zh_CN.rst
+
+:Original: Documentation/admin-guide/LSM/ipe.rst
+:翻译:
+ 朱岩 Yan Zhu <zhuyan2015@qq.com>
+
+==================================
+Integrity Policy Enforcement (IPE)
+==================================
+
+.. NOTE::
+
+   本文档面向管理员、系统构建者或尝试使用 IPE 的个人。如需面向开发者的 IPE 文
+   档,请参阅 :doc:`设计文档 </security/ipe>`。
+
+概览
+----
+
+Integrity Policy Enforcement (IPE) 是一种 Linux 安全模块,采用与传统访问控制
+互补的方式。不同于依赖标签和路径进行决策的传统访问控制机制,IPE 关注系统组件
+固有的不可变安全属性。这些属性是系统组件的基本特征或特性,无法被更改,从而为
+安全决策提供一致且可靠的基础。
+
+具体而言,在 IPE 的语境中,系统组件主要指文件或这些文件所在的设备。但这只是个
+起点。系统组件的概念是灵活的,可以随着系统演进而扩展以涵盖新的元素。不可变属
+性包括文件的来源,它随时间保持不变且不可更改。例如,IPE 策略可以被设计为信任
+来自 initramfs 的文件。由于 initramfs 通常由引导加载程序验证,其中的文件被认
+为是可信的;因而 "文件来自 initramfs" 成为 IPE 考量的不可变属性。
+
+不可变属性的概念还延伸到文件来源上启用的安全特性,例如 dm-verity 或
+fs-verity,它们提供了完整性与信任的附加层。例如,IPE 允许定义信任来自
+dm-verity 保护设备的文件的策略。dm-verity 通过提供可验证且不可变的内容状态来
+确保整个设备的完整性。类似地,fs-verity 提供文件系统级别的完整性检查,允许
+IPE 强制执行信任受 fs-verity 保护的文件的策略。这两项特性一旦建立便无法关闭,
+因此被视为不可变属性。这些示例展示了 IPE 如何利用不可变属性(如文件来源及其完
+整性保护机制)来进行访问控制决策。
+
+具体而言,IPE 策略通过依据策略中定义的参考值评估安全属性,来实施严格的访问控
+制。评估既可以基于安全属性的存在与否(如验证文件是否源自 initramfs),也可以
+评估不可变安全属性的内部状态。后者包括检查 dm-verity 保护设备的 roothash、判
+断 dm-verity 是否拥有有效签名、评估 fs-verity 保护文件的摘要、或判断
+fs-verity 是否拥有有效的内建签名。这种精细的策略执行方法实现了高度安全且可定
+制的系统防御机制,能够适应特定的安全需求和信任模型。
+
+要启用 IPE,请确保在 :menuselection:`Security --> Integrity Policy
+Enforcement (IPE)` 下的 ``CONFIG_SECURITY_IPE`` 配置选项已打开。
+
+使用场景
+--------
+
+IPE 最适用于固定功能设备:即用途明确定义且不应更改的设备(如数据中心的网络防
+火墙设备、IoT 设备等),其中所有软件和配置均由系统所有者构建和部署。
+
+IPE 在通用计算领域还有很长的路要走:Linux 社区整体倾向于采用去中心化的信任模
+型(即 web of trust),而 IPE 目前尚不支持此模型。相反,IPE 支持 PKI(公钥基
+础设施),通常指定一组受信任的实体来提供一定程度的绝对信任。
+
+此外,虽然当今大多数软件包已经过签名,但包内的文件(例如可执行文件)通常未签
+名。这使得在需要包管理器正常工作的系统中难以利用 IPE,除非对包管理器及其背后
+的生态系统进行重大更改。
+
+digest_cache LSM [#digest_cache_lsm]_ 是一个系统,当与 IPE 结合使用时,可
+用于启用和支持通用计算场景。
+
+已知限制
+--------
+
+IPE 无法验证匿名可执行内存的完整性,例如 gcc 闭包和 libffi (<3.4.2) 创建的跳
+转表,或 JIT 编译的代码。不幸的是,由于这些是动态生成的代码,IPE 无法确保其完
+整性从而形成信任基础。
+
+当解释型语言的脚本通过向解释器传递程序文件的方式被调用时,IPE 无法验证其程序
+完整性。这是由解释器执行这些文件的方式所致;脚本本身并不通过 IPE 的任一钩
+子被评估为可执行代码,它们只是作为文本文件被读取(与已编译的可执行文件相对)。
+然而,随着 ``AT_EXECVE_CHECK`` 标志的引入
+(:doc:`AT_EXECVE_CHECK </userspace-api/check_exec>`),解释器可以使用它来通知
+内核一个脚本文件即将被执行,并请求内核对其执行 LSM 安全检查。
+
+IPE 的 EXECUTE 操作执行策略在已编译可执行文件和解释型脚本之间有所不同:对于已
+编译可执行文件,执行检查在内核执行 ``execve()``、``execveat()``、``mmap()``
+和 ``mprotect()`` 系统调用加载可执行内容时自动触发。对于解释型脚本,执行检查
+需要解释器显式集成,使用带 ``AT_EXECVE_CHECK`` 标志的 ``execveat()``。与 IPE
+在执行过程中拦截的 exec 系统调用不同,这一机制需要解释器主动配合,除非加入信
+号调用,否则现有解释器不会自动获得支持。
+
+威胁模型
+--------
+
+IPE 专门针对内核初始引导后用户空间可执行代码被篡改的风险,包括通过
+``modprobe`` 或 ``insmod`` 从用户空间加载的内核模块。
+
+举例说明,设想一个场景:一个不受信任的二进制文件(可能是恶意的)连同各种必要
+的依赖项(包括加载器和 libc)一起被下载。IPE 在此场景中的主要功能是阻止此类二
+进制文件及其依赖项的执行。
+
+IPE 通过在执行前验证所有可执行代码的完整性和真实性来实现这一目标。它进行彻底
+的检查,确保代码的完整性完好无损,并且代码与既定策略中授权的参考值(摘要、签
+名等)匹配。如果二进制文件未通过此验证过程——无论是因其完整性受到破坏还是不符
+合授权条件——IPE 将拒绝其执行。此外,IPE 还会生成审计日志,可用于检测和分析因
+策略违规而导致的失败。
+
+篡改威胁场景包括各类行为者对可执行代码的修改或替换,包括:
+
+-  对硬件有物理访问权限的行为者
+-  对系统有本地网络访问权限的行为者
+-  对部署系统有访问权限的行为者
+-  处于外部控制下的受感染内部系统
+-  系统的恶意终端用户
+-  系统受感染的终端用户
+-  系统的远程(外部)入侵
+
+IPE 不缓解来自恶意但已获授权的开发者(持有签名证书),或其使用的受感染开发工
+具(即面向返回编程攻击)所带来的威胁。此外,IPE 在用户空间与内核空间之间划定
+了硬安全边界。因此,内核级别的漏洞利用被视为超出 IPE 的范围,应由其他机制来缓解。
+
+策略
+----
+
+IPE 策略是一种纯文本 [#devdoc]_ 格式,由多条分布在多行上的语句组成。策略顶部
+有一行必需的内容,用于指明策略名称和策略版本,例如::
+
+   policy_name=Ex_Policy policy_version=0.0.0
+
+策略名称是唯一键值,以人类可读的名称标识该策略。它用于在 securityfs 下创建节点,
+以及唯一标识策略以部署新策略或更新现有策略。
+
+策略版本表示策略的当前版本(**非** 策略语法版本)。用于防止将策略回滚到可能不
+安全的旧版本。
+
+IPE 策略的下一部分是规则。规则由 ``key=value`` 对组成,称为属性。IPE 规则需
+要两个属性:``action`` 决定了 IPE 在匹配到该规则时的行为,以及 ``op`` 决定了
+该规则何时应该被评估。顺序很重要,规则必须以 ``op`` 开头,以 ``action`` 结尾。
+因此,最小规则是::
+
+   op=EXECUTE action=ALLOW
+
+此示例将允许任何执行。其他属性用于评估被评估文件的不可变安全属性。这些属性旨
+在描述内核中能够提供完整性验证度量的系统,使得 IPE 能够根据属性的取值来判断资
+源的可信度。
+
+规则从上到下进行评估。因此,任何撤销规则或拒绝规则应放在文件的较早位置,以确
+保这些规则在带有 ``action=ALLOW`` 的规则之前被评估。
+
+IPE 策略支持注释。字符 '#' 起到注释的作用,忽略 '#' 右侧直到换行符之前的所有
+字符。
+
+IPE 评估的默认行为也可以通过 ``DEFAULT`` 语句在策略中表达。可以在全局级别或
+按操作级别设置::
+
+   # 全局
+   DEFAULT action=ALLOW
+
+   # 按操作级别
+   DEFAULT op=EXECUTE action=ALLOW
+
+必须为 IPE 中所有已知操作设置默认值。如果您希望保留旧策略与新内核的兼容性(新
+内核可能引入新操作),可以设置 ``ALLOW`` 的全局默认值,然后在每个操作的基础上
+覆盖默认值(如上所示)。
+
+对于基于可配置策略的 LSM,在启动时实施可配置策略存在若干围绕策略读取和解析的
+问题:
+
+1. 内核 **不应** 从用户空间读取文件,因此直接读取策略文件是被禁止的。
+2. 内核命令行有字符长度限制,一个内核模块不应为自身配置占用整个字符限制。
+3. 内核生态系统中存在各种引导加载程序,因此通过传递内存块的方式成本高昂,难以
+   维护。
+
+为此,IPE 通过 "boot policy" 的概念来解决这个问题。启动策略是一份编译进内核
+的最小策略。该策略旨在将系统引导至用户空间已设置好并可以接收命令的状态,届时
+可以通过 securityfs 部署更复杂的策略。启动策略可以通过
+``SECURITY_IPE_BOOT_POLICY`` 配置选项指定,该选项接受要应用的 IPE 策略纯文本
+版本的路径。此策略将被编译进内核。如果未指定,IPE 将保持禁用状态,直到通过
+securityfs 部署并激活策略。
+
+部署策略
+~~~~~~~~
+
+策略可以通过 securityfs 从用户空间部署。这些策略通过 PKCS#7 消息格式进行签名,
+以实施某种程度的策略授权(防止攻击者获得无限制的 root 权限并部署 "allow all"
+策略)。这些策略必须由可链至 ``SYSTEM_TRUSTED_KEYRING`` 的证书签名,或者如果
+分别启用了 ``CONFIG_IPE_POLICY_SIG_SECONDARY_KEYRING`` 和/或
+``CONFIG_IPE_POLICY_SIG_PLATFORM_KEYRING``,则可链至次级和/或平台密钥环。使
+用 openssl,可以通过以下命令对策略进行签名::
+
+   openssl smime -sign \
+      -in "$MY_POLICY" \
+      -signer "$MY_CERTIFICATE" \
+      -inkey "$MY_PRIVATE_KEY" \
+      -noattr \
+      -nodetach \
+      -nosmimecap \
+      -outform der \
+      -out "$MY_POLICY.p7b"
+
+部署策略通过 securityfs 的 ``new_policy`` 节点完成。要部署策略,只需将文件内
+容 cat 到 securityfs 节点::
+
+   cat "$MY_POLICY.p7b" > /sys/kernel/security/ipe/new_policy
+
+成功后,这将在 ``/sys/kernel/security/ipe/policies/`` 下创建一个子目录。该子
+目录以所部署策略的 ``policy_name`` 字段命名,因此对于上述示例,目录将是
+``/sys/kernel/security/ipe/policies/Ex_Policy``。在该目录中,将包含七个文件:
+``pkcs7``、``policy``、``name``、``version``、``active``、``update`` 和
+``delete``。
+
+``pkcs7`` 文件为只读。读取它将返回提供给内核的原始 PKCS#7 数据,即策略的表示。
+如果正在读取的是启动策略,将返回 ``ENOENT``,因为它未经签名。
+
+``policy`` 文件为只读。读取它将返回 PKCS#7 的内部内容,即纯文本策略。
+
+``active`` 文件用于将某个策略设置为当前活动策略。此文件可读写,接受值 ``"1"``
+以将策略设置为活动状态。由于同一时间只能有一个策略处于活动状态,所有其他策略
+将被标记为非活动。被标记为活动的策略必须具有大于或等于当前运行版本的策略版本。
+
+``update`` 文件用于更新内核中已存在的策略。此文件为只写,接受 PKCS#7 签名的
+策略。对此策略将始终执行两项检查:第一,``policy_names`` 必须在更新版本和现有
+版本之间匹配。第二,更新策略的版本必须大于当前运行版本的策略版本。这是为了防
+止回滚攻击。
+
+``delete`` 文件用于删除不再需要的策略。此文件为只写,接受值 ``1`` 以删除策略。
+删除时,代表该策略的 securityfs 节点将被移除。但是,不允许删除当前活动策略,
+将返回操作不允许的错误。
+
+类似地,写入 ``update`` 和 ``new_policy`` 可能导致消息无效错误(策略语法错误)或
+文件已存在错误。后一种错误发生在尝试部署具有某 ``policy_name`` 的策略时,而内核
+中已部署了具有相同 ``policy_name`` 的策略。
+
+部署策略 **不会** 导致 IPE 开始强制执行该策略。IPE 只会强制执行标记为活动的
+策略。请注意,同一时间只能有一个策略处于活动状态。
+
+部署成功后,可以通过写入文件
+``/sys/kernel/security/ipe/policies/$policy_name/active`` 来激活策略。例如,
+可以通过以下命令激活 ``Ex_Policy``::
+
+   echo 1 > "/sys/kernel/security/ipe/policies/Ex_Policy/active"
+
+从此刻起,``Ex_Policy`` 成为系统上强制执行的策略。
+
+IPE 还提供了删除策略的方法。这可以通过 ``delete`` securityfs 节点
+``/sys/kernel/security/ipe/policies/$policy_name/delete`` 来完成。
+向该文件写入 ``1`` 以删除策略::
+
+   echo 1 > "/sys/kernel/security/ipe/policies/$policy_name/delete"
+
+删除策略只有一个要求:被删除的策略必须处于非活动状态。
+
+.. NOTE::
+
+   如果启用了传统 MAC 系统(SELinux、apparmor、smack),所有对 IPE
+   securityfs 节点的写入都需要 ``CAP_MAC_ADMIN``。
+
+模式
+~~~~
+
+IPE 支持两种操作模式:宽松模式(permissive,类似 SELinux 的 permissive 模式)
+和强制模式(enforced)。在宽松模式下,所有事件都会被检查,策略违规会被记录,
+但策略不会真正被强制执行。这允许用户在强制执行之前先测试策略。
+
+默认模式为强制模式(enforce),可以通过内核命令行参数 ``ipe.enforce=(0|1)``
+或 securityfs 节点 ``/sys/kernel/security/ipe/enforce`` 进行更改。
+
+.. NOTE::
+
+   如果启用了传统 MAC 系统(SELinux、apparmor、smack 等),所有对
+   IPE securityfs 节点的写入都需要 ``CAP_MAC_ADMIN``。
+
+审计事件
+~~~~~~~~
+
+1420 AUDIT_IPE_ACCESS
+^^^^^^^^^^^^^^^^^^^^^
+事件示例::
+
+   type=1420 audit(1653364370.067:61): ipe_op=EXECUTE ipe_hook=MMAP enforcing=1 pid=2241 comm="ld-linux.so" path="/deny/lib/libc.so.6" dev="sda2" ino=14549020 rule="DEFAULT action=DENY"
+   type=1300 audit(1653364370.067:61): SYSCALL arch=c000003e syscall=9 success=no exit=-13 a0=7f1105a28000 a1=195000 a2=5 a3=812 items=0 ppid=2219 pid=2241 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=pts0 ses=2 comm="ld-linux.so" exe="/tmp/ipe-test/lib/ld-linux.so" subj=unconfined key=(null)
+   type=1327 audit(1653364370.067:61): 707974686F6E3300746573742F6D61696E2E7079002D6E00
+
+   type=1420 audit(1653364735.161:64): ipe_op=EXECUTE ipe_hook=MMAP enforcing=1 pid=2472 comm="mmap_test" path=? dev=? ino=? rule="DEFAULT action=DENY"
+   type=1300 audit(1653364735.161:64): SYSCALL arch=c000003e syscall=9 success=no exit=-13 a0=0 a1=1000 a2=4 a3=21 items=0 ppid=2219 pid=2472 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=pts0 ses=2 comm="mmap_test" exe="/root/overlake_test/upstream_test/vol_fsverity/bin/mmap_test" subj=unconfined key=(null)
+   type=1327 audit(1653364735.161:64): 707974686F6E3300746573742F6D61696E2E7079002D6E00
+
+此事件表明 IPE 做出了访问控制决策;IPE 特有记录 (1420) 始终与
+``AUDITSYSCALL`` 记录一起发出。
+
+可以通过 ``AUDITSYSCALL`` 记录的 ``success`` 属性和退出码来判断 IPE 处于宽松
+模式还是强制模式。
+
+字段说明:
+
++-----------+----------+--------+--------------------------------------------+
+| 字段      | 值类型   | 可选?  | 值说明                                     |
++===========+==========+========+============================================+
+| ipe_op    | string   | 否     | 与日志关联的 IPE 操作名称                  |
++-----------+----------+--------+--------------------------------------------+
+| ipe_hook  | string   | 否     | 触发 IPE 事件的 LSM 钩子名称               |
++-----------+----------+--------+--------------------------------------------+
+| enforcing | integer  | 否     | 当前 IPE 强制执行状态,1:强制模式,0:宽松模式|
++-----------+----------+--------+--------------------------------------------+
+| pid       | integer  | 否     | 触发 IPE 事件的进程 PID                    |
++-----------+----------+--------+--------------------------------------------+
+| comm      | string   | 否     | 触发 IPE 事件的进程命令行程序名            |
++-----------+----------+--------+--------------------------------------------+
+| path      | string   | 是     | 被评估文件的绝对路径                       |
++-----------+----------+--------+--------------------------------------------+
+| ino       | integer  | 是     | 被评估文件的 inode 号                      |
++-----------+----------+--------+--------------------------------------------+
+| dev       | string   | 是     | 被评估文件的设备名,如 vda                  |
++-----------+----------+--------+--------------------------------------------+
+| rule      | string   | 否     | 匹配的策略规则                             |
++-----------+----------+--------+--------------------------------------------+
+
+
+1421 AUDIT_IPE_CONFIG_CHANGE
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+事件示例::
+
+   type=1421 audit(1653425583.136:54): old_active_pol_name="Allow_All" old_active_pol_version=0.0.0 old_policy_digest=sha256:E3B0C44298FC1C149AFBF4C8996FB92427AE41E4649B934CA495991B7852B855 new_active_pol_name="boot_verified" new_active_pol_version=0.0.0 new_policy_digest=sha256:820EEA5B40CA42B51F68962354BA083122A20BB846F26765076DD8EED7B8F4DB auid=4294967295 ses=4294967295 lsm=ipe res=1
+   type=1300 audit(1653425583.136:54): SYSCALL arch=c000003e syscall=1 success=yes exit=2 a0=3 a1=5596fcae1fb0 a2=2 a3=2 items=0 ppid=184 pid=229 auid=4294967295 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=pts0 ses=4294967295 comm="python3" exe="/usr/bin/python3.10" key=(null)
+   type=1327 audit(1653425583.136:54): PROCTITLE proctitle=707974686F6E3300746573742F6D61696E2E7079002D66002E2
+
+此事件表明 IPE 将活动策略从一个切换到了另一个,同时记录了这两个策略的版本和哈
+希摘要。请注意,IPE 同一时间只能有一个活动策略,所有访问决策评估均基于当前活
+动策略。部署新策略的常规流程是先将策略加载到内核中,然后再将活动策略切换至它。
+
+此记录始终与 ``write`` 系统调用的 ``AUDITSYSCALL`` 记录一起发出。
+
+字段说明:
+
++------------------------+-----------+--------+----------------------------+
+| 字段                   | 值类型    | 可选? | 值说明                     |
++========================+===========+========+============================+
+| old_active_pol_name    | string    | 是     | 前一个活动策略的名称       |
++------------------------+-----------+--------+----------------------------+
+| old_active_pol_version | string    | 是     | 前一个活动策略的版本       |
++------------------------+-----------+--------+----------------------------+
+| old_policy_digest      | string    | 是     | 前一个活动策略的哈希值     |
++------------------------+-----------+--------+----------------------------+
+| new_active_pol_name    | string    | 否     | 当前活动策略的名称         |
++------------------------+-----------+--------+----------------------------+
+| new_active_pol_version | string    | 否     | 当前活动策略的版本         |
++------------------------+-----------+--------+----------------------------+
+| new_policy_digest      | string    | 否     | 当前活动策略的哈希值       |
++------------------------+-----------+--------+----------------------------+
+| auid                   | integer   | 否     | 登录用户 ID                |
++------------------------+-----------+--------+----------------------------+
+| ses                    | integer   | 否     | 登录会话 ID                |
++------------------------+-----------+--------+----------------------------+
+| lsm                    | string    | 否     | 与该事件关联的 LSM 名称    |
++------------------------+-----------+--------+----------------------------+
+| res                    | integer   | 否     | 审计操作的结果(成功/失败)|
++------------------------+-----------+--------+----------------------------+
+
+1422 AUDIT_IPE_POLICY_LOAD
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+事件示例::
+
+   type=1422 audit(1653425529.927:53): policy_name="boot_verified" policy_version=0.0.0 policy_digest=sha256:820EEA5B40CA42B51F68962354BA083122A20BB846F26765076DD8EED7B8F4DB auid=4294967295 ses=4294967295 lsm=ipe res=1 errno=0
+   type=1300 audit(1653425529.927:53): arch=c000003e syscall=1 success=yes exit=2567 a0=3 a1=5596fcae1fb0 a2=a07 a3=2 items=0 ppid=184 pid=229 auid=4294967295 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=pts0 ses=4294967295 comm="python3" exe="/usr/bin/python3.10" key=(null)
+   type=1327 audit(1653425529.927:53): PROCTITLE proctitle=707974686F6E3300746573742F6D61696E2E7079002D66002E2E
+
+此记录表明新策略已加载到内核中,包含策略名称、策略版本和策略哈希。
+
+此记录始终与 ``write`` 系统调用的 ``AUDITSYSCALL`` 记录一起发出。
+
+字段说明:
+
++----------------+-----------+--------+-----------------------------------+
+| 字段           | 值类型    | 可选? | 值说明                            |
++================+===========+========+===================================+
+| policy_name    | string    | 是     | 策略名称                          |
++----------------+-----------+--------+-----------------------------------+
+| policy_version | string    | 是     | 策略版本                          |
++----------------+-----------+--------+-----------------------------------+
+| policy_digest  | string    | 是     | 策略哈希                          |
++----------------+-----------+--------+-----------------------------------+
+| auid           | integer   | 否     | 登录用户 ID                       |
++----------------+-----------+--------+-----------------------------------+
+| ses            | integer   | 否     | 登录会话 ID                       |
++----------------+-----------+--------+-----------------------------------+
+| lsm            | string    | 否     | 与该事件关联的 LSM 名称           |
++----------------+-----------+--------+-----------------------------------+
+| res            | integer   | 否     | 审计操作的结果(成功/失败)       |
++----------------+-----------+--------+-----------------------------------+
+| errno          | integer   | 否     | 策略加载操作的错误码(见下表)    |
++----------------+-----------+--------+-----------------------------------+
+
+策略错误码 (errno):
+
+下表列出了在加载或更新策略时可能出现在 errno 字段中的错误码:
+
++----------------+------------------------------------------------------+
+| 错误码         | 说明                                                 |
++================+======================================================+
+| 0              | 成功                                                 |
++----------------+------------------------------------------------------+
+| -EPERM         | 权限不足                                             |
++----------------+------------------------------------------------------+
+| -EEXIST        | 同名策略已部署                                       |
++----------------+------------------------------------------------------+
+| -EBADMSG       | 策略无效                                             |
++----------------+------------------------------------------------------+
+| -ENOMEM        | 内存不足 (OOM)                                       |
++----------------+------------------------------------------------------+
+| -ERANGE        | 策略版本号溢出                                       |
++----------------+------------------------------------------------------+
+| -EINVAL        | 策略版本解析错误                                     |
++----------------+------------------------------------------------------+
+| -ENOKEY        | 签名 IPE 策略的密钥未在密钥环中找到                  |
++----------------+------------------------------------------------------+
+| -EKEYREJECTED  | 策略签名验证失败                                     |
++----------------+------------------------------------------------------+
+| -ESTALE        | 尝试使用旧版本更新 IPE 策略                          |
++----------------+------------------------------------------------------+
+| -ENOENT        | 策略在更新期间被删除                                 |
++----------------+------------------------------------------------------+
+
+1404 AUDIT_MAC_STATUS
+^^^^^^^^^^^^^^^^^^^^^
+
+事件示例::
+
+   type=1404 audit(1653425689.008:55): enforcing=0 old_enforcing=1 auid=4294967295 ses=4294967295 enabled=1 old-enabled=1 lsm=ipe res=1
+   type=1300 audit(1653425689.008:55): arch=c000003e syscall=1 success=yes exit=2 a0=1 a1=55c1065e5c60 a2=2 a3=0 items=0 ppid=405 pid=441 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=)
+   type=1327 audit(1653425689.008:55): proctitle="-bash"
+
+   type=1404 audit(1653425689.008:55): enforcing=1 old_enforcing=0 auid=4294967295 ses=4294967295 enabled=1 old-enabled=1 lsm=ipe res=1
+   type=1300 audit(1653425689.008:55): arch=c000003e syscall=1 success=yes exit=2 a0=1 a1=55c1065e5c60 a2=2 a3=0 items=0 ppid=405 pid=441 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=)
+   type=1327 audit(1653425689.008:55): proctitle="-bash"
+
+此记录始终与 ``write`` 系统调用的 ``AUDITSYSCALL`` 记录一起发出。
+
+字段说明:
+
++--------------+-----------+--------+-------------------------------------------------------+
+| 字段         | 值类型    | 可选? | 值说明                                                |
++==============+===========+========+=======================================================+
+| enforcing    | integer   | 否     | IPE 切换到的强制执行状态,1:强制模式,0:宽松模式        |
++--------------+-----------+--------+-------------------------------------------------------+
+| old_enforcing| integer   | 否     | IPE 切换前的强制执行状态,1:强制模式,0:宽松模式        |
++--------------+-----------+--------+-------------------------------------------------------+
+| auid         | integer   | 否     | 登录用户 ID                                           |
++--------------+-----------+--------+-------------------------------------------------------+
+| ses          | integer   | 否     | 登录会话 ID                                           |
++--------------+-----------+--------+-------------------------------------------------------+
+| enabled      | integer   | 否     | 新的 TTY 审计启用设置                                 |
++--------------+-----------+--------+-------------------------------------------------------+
+| old-enabled  | integer   | 否     | 旧的 TTY 审计启用设置                                 |
++--------------+-----------+--------+-------------------------------------------------------+
+| lsm          | string    | 否     | 与该事件关联的 LSM 名称                               |
++--------------+-----------+--------+-------------------------------------------------------+
+| res          | integer   | 否     | 审计操作的结果(成功/失败)                           |
++--------------+-----------+--------+-------------------------------------------------------+
+
+
+成功审计
+^^^^^^^^
+
+IPE 支持成功审计。启用后,所有通过 IPE 策略且未被阻止的事件都将生成审计事件。
+此功能默认关闭,可以通过内核命令行参数 ``ipe.success_audit=(0|1)`` 或
+securityfs 文件 ``/sys/kernel/security/ipe/success_audit`` 启用。
+
+这会产生 **大量** 日志,因为 IPE 会检查系统上的每一个用户空间二进制文件,但
+在调试策略时非常有用。
+
+.. NOTE::
+
+   如果启用了传统 MAC 系统(SELinux、apparmor、smack 等),所有对 IPE
+   securityfs 节点的写入都需要 ``CAP_MAC_ADMIN``。
+
+属性
+----
+
+如上所述,IPE 属性是 IPE 策略中以 ``key=value`` 对表示的表达式。策略解析器内
+置了两个属性:'op' 和 'action'。其他属性用于限定被评估文件的不可变安全属性。
+目前这些属性包括:'``boot_verified``'、'``dmverity_signature``'、
+'``dmverity_roothash``'、'``fsverity_signature``'、'``fsverity_digest``'。
+以下列出了 IPE 支持的所有属性的说明:
+
+op
+~~
+
+表示规则适用于的操作。必须作为第一条标记出现在每条规则中。IPE 支持
+以下操作:
+
+   ``EXECUTE``
+
+      适用于任何试图被执行或作为可执行文件加载的文件。
+
+   ``FIRMWARE``:
+
+      适用于通过 firmware_class 接口加载的固件。涵盖预分配缓冲区和固件文件本
+      身。
+
+   ``KMODULE``:
+
+      适用于通过 ``modprobe`` 或 ``insmod`` 加载的内核模块。
+
+   ``KEXEC_IMAGE``:
+
+      适用于通过 ``kexec`` 加载的内核镜像。
+
+   ``KEXEC_INITRAMFS``
+
+      适用于通过 ``kexec --initrd`` 加载的 initrd 镜像。
+
+   ``POLICY``:
+
+      通过内核空间发起的读取操作控制策略加载。
+
+      例如,将策略文件路径写入 ``$securityfs/ima/policy`` 来加载 IMA 策略。
+
+   ``X509_CERT``:
+
+      通过 Kconfig 选项 ``CONFIG_IMA_X509_PATH`` 和
+      ``CONFIG_EVM_X509_PATH`` 控制 IMA 证书的加载。
+
+action
+~~~~~~
+
+   决定 IPE 在规则匹配时的行为。必须作为最后一条子句出现在每条规则中。可以取
+   以下值:
+
+   ``ALLOW``:
+
+      如果规则匹配,显式允许访问资源并继续执行,不再评估后续规则。
+
+   ``DENY``:
+
+      如果规则匹配,显式禁止访问资源并停止执行,不再评估后续规则。
+
+boot_verified
+~~~~~~~~~~~~~
+
+   此属性可用于对来自 initramfs 的文件进行授权。
+   此属性的格式是::
+
+         boot_verified=(TRUE|FALSE)
+
+   .. WARNING::
+
+      此属性将信任来自 initramfs(rootfs) 的文件。它应仅在早期引导阶段使用。
+      在实际根文件系统挂载到 initramfs 之上之前,initramfs 脚本将递归删除
+      initramfs 上的所有文件和目录。这通常通过使用
+      switch_root(8) [#switch_root]_ 来实现。因此在实际根文件系统接管后,
+      initramfs 将为空且不可访问。建议在此之后切换到不依赖该属性的其他策略。
+      这样可以确保信任策略在整个系统运行期间保持相关和有效。
+
+dmverity_roothash
+~~~~~~~~~~~~~~~~~
+
+   此属性可用于授权或撤销特定的 dm-verity 卷,通过其根哈希进行标识。它依赖于
+   DM_VERITY 模块。此属性由 ``IPE_PROP_DM_VERITY`` 配置选项控制,当
+   ``SECURITY_IPE`` 和 ``DM_VERITY`` 都启用时它将自动被选中。
+   此属性的格式是::
+
+      dmverity_roothash=DigestName:HexadecimalString
+
+   dmverity_roothash 支持的 DigestName 有 [#dmveritydigests]_
+
+      + blake2b-512
+      + blake2s-256
+      + sha256
+      + sha384
+      + sha512
+      + sha3-224
+      + sha3-256
+      + sha3-384
+      + sha3-512
+      + sm3
+      + rmd160
+
+dmverity_signature
+~~~~~~~~~~~~~~~~~~
+
+   此属性可用于授权所有具有已签名 roothash 的 dm-verity 卷,该签名需由
+   dm-verity 配置中指定的密钥环(系统受信任密钥环或次级密钥环)验证有效。它依
+   赖于 ``DM_VERITY_VERIFY_ROOTHASH_SIG`` 配置选项,并由
+   ``IPE_PROP_DM_VERITY_SIGNATURE`` 配置选项控制,当 ``SECURITY_IPE``、
+   ``DM_VERITY`` 和 ``DM_VERITY_VERIFY_ROOTHASH_SIG`` 都启用时它将自动被选中。
+   此属性的格式是::
+
+      dmverity_signature=(TRUE|FALSE)
+
+fsverity_digest
+~~~~~~~~~~~~~~~
+
+   此属性可用于授权特定的启用了 fsverity 的文件,通过其 fsverity 摘要进行标
+   识。它依赖于 ``FS_VERITY`` 配置选项,并由 ``IPE_PROP_FS_VERITY`` 配置选项
+   控制,当 ``SECURITY_IPE`` 和 ``FS_VERITY`` 都启用时它将自动被选中。
+   此属性的格式是::
+
+      fsverity_digest=DigestName:HexadecimalString
+
+   fsverity_digest 支持的 DigestName 有 [#fsveritydigest]_
+
+      + sha256
+      + sha512
+
+fsverity_signature
+~~~~~~~~~~~~~~~~~~
+
+   此属性用于授权所有已通过 fs-verity 内建签名机制验证的启用了 fs-verity 的
+   文件。签名验证依赖于存储在 ".fs-verity" 密钥环中的密钥。它依赖于
+   ``FS_VERITY_BUILTIN_SIGNATURES`` 配置选项,并由 ``IPE_PROP_FS_VERITY``
+   配置选项控制,当 ``SECURITY_IPE``、``FS_VERITY`` 和
+   ``FS_VERITY_BUILTIN_SIGNATURES`` 都启用时它将自动被选中。
+   此属性的格式是::
+
+      fsverity_signature=(TRUE|FALSE)
+
+策略示例
+--------
+
+全部允许
+~~~~~~~~
+
+::
+
+   policy_name=Allow_All policy_version=0.0.0
+   DEFAULT action=ALLOW
+
+仅允许 initramfs
+~~~~~~~~~~~~~~~~
+
+::
+
+   policy_name=Allow_Initramfs policy_version=0.0.0
+   DEFAULT action=DENY
+
+   op=EXECUTE boot_verified=TRUE action=ALLOW
+
+允许任意已签名并验证的 dm-verity 卷和 initramfs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+::
+
+   policy_name=Allow_Signed_DMV_And_Initramfs policy_version=0.0.0
+   DEFAULT action=DENY
+
+   op=EXECUTE boot_verified=TRUE action=ALLOW
+   op=EXECUTE dmverity_signature=TRUE action=ALLOW
+
+禁止从特定 dm-verity 卷执行
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+::
+
+   policy_name=Deny_DMV_By_Roothash policy_version=0.0.0
+   DEFAULT action=DENY
+
+   op=EXECUTE dmverity_roothash=sha256:cd2c5bae7c6c579edaae4353049d58eb5f2e8be0244bf05345bc8e5ed257baff action=DENY
+
+   op=EXECUTE boot_verified=TRUE action=ALLOW
+   op=EXECUTE dmverity_signature=TRUE action=ALLOW
+
+仅允许特定 dm-verity 卷
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+::
+
+   policy_name=Allow_DMV_By_Roothash policy_version=0.0.0
+   DEFAULT action=DENY
+
+   op=EXECUTE dmverity_roothash=sha256:401fcec5944823ae12f62726e8184407a5fa9599783f030dec146938 action=ALLOW
+
+允许任意带有有效内建签名的 fs-verity 文件
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+::
+
+   policy_name=Allow_Signed_And_Validated_FSVerity policy_version=0.0.0
+   DEFAULT action=DENY
+
+   op=EXECUTE fsverity_signature=TRUE action=ALLOW
+
+允许执行特定 fs-verity 文件
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+::
+
+   policy_name=ALLOW_FSV_By_Digest policy_version=0.0.0
+   DEFAULT action=DENY
+
+   op=EXECUTE fsverity_digest=sha256:fd88f2b8824e197f850bf4c5109bea5cf0ee38104f710843bb72da796ba5af9e action=ALLOW
+
+附加信息
+--------
+
+- `Github 仓库 <https://github.com/microsoft/ipe>`_
+- :doc:`IPE 开发者和设计文档 </security/ipe>`
+
+FAQ
+---
+
+Q:
+   IPE 与其他提供基于信任的访问控制的 LSM 有什么区别?
+
+A:
+
+   一般而言,另外有两种 LSM 可以提供类似的功能:IMA 和 Loadpin。
+
+   IMA 和 IPE 在功能上非常相似。两者之间的显著区别在于策略。 [#devdoc]_
+
+   Loadpin 与 IPE 差异相当大,因为 Loadpin 只覆盖了 IPE 的内核读取操作,而
+   IPE 除了内核读取外还能控制执行。信任模型也不同;Loadpin 的信任根植于初始
+   超级块,而 IPE 的信任来源于内核自身(通过 ``SYSTEM_TRUSTED_KEYS``)。
+
+------------
+
+.. [#digest_cache_lsm] https://lore.kernel.org/lkml/20240415142436.2545003-1-roberto.sassu@huaweicloud.com/
+
+.. [#devdoc] 关于此主题的更多信息,请参阅 :doc:`设计文档 </security/ipe>`。
+
+.. [#switch_root] https://man7.org/linux/man-pages/man8/switch_root.8.html
+
+.. [#dmveritydigests] 这些哈希算法基于 Linux 加密 API 接受的值;IPE 不限制摘
+                      要算法本身;因此,此列表可能不是最新的。
+
+.. [#fsveritydigest] 这些哈希算法基于内核 fsverity 支持接受的值;IPE 不限制
+                     摘要算法本身;因此,此列表可能不是最新的。
-- 
2.43.0


^ permalink raw reply related

* [PATCH 05/10] docs/zh_CN: add LSM/Smack Chinese translation
From: Yan Zhu @ 2026-06-12 15:58 UTC (permalink / raw)
  To: alexs, si.yanteng, corbet, mic
  Cc: dzm91, skhan, gnoack, zhuyan2015, linux-doc,
	linux-security-module
In-Reply-To: <cover.1781105672.git.zhuyan2015@qq.com>

Translate Documentation/admin-guide/LSM/Smack.rst into Chinese.

Update the translation through commit 674e2b24791c
("smack: fix bug: setting task label silently ignores input garbage")

Assisted-by: Claude:deepseek-4-pro
Signed-off-by: Yan Zhu <zhuyan2015@qq.com>
---
 .../zh_CN/admin-guide/LSM/Smack.rst           | 722 ++++++++++++++++++
 1 file changed, 722 insertions(+)
 create mode 100644 Documentation/translations/zh_CN/admin-guide/LSM/Smack.rst

diff --git a/Documentation/translations/zh_CN/admin-guide/LSM/Smack.rst b/Documentation/translations/zh_CN/admin-guide/LSM/Smack.rst
new file mode 100644
index 000000000000..cdc5221624a5
--- /dev/null
+++ b/Documentation/translations/zh_CN/admin-guide/LSM/Smack.rst
@@ -0,0 +1,722 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: ../../disclaimer-zh_CN.rst
+
+:Original: Documentation/admin-guide/LSM/Smack.rst
+:翻译:
+ 朱岩 Yan Zhu <zhuyan2015@qq.com>
+
+
+=====
+Smack
+=====
+
+"Good for you, you've decided to clean the elevator!"
+- The Elevator, from Dark Star
+
+Smack 是简化的强制访问控制 (MAC) 内核实现。
+它是一种基于内核的强制访问控制方案,设计目标强调
+简洁性。
+
+Smack 不是 Linux 唯一可用的 MAC 机制。首次接触 MAC
+的用户应将 Smack 与其他机制进行比较,以决定哪种最适合
+当前需求。
+
+Smack 由三大部分组成:
+
+- 内核
+- 基本工具(有帮助但非必需)
+- 配置数据
+
+内核部分实现为 Linux 安全模块 (LSM)。它依赖 netlabel,
+并在支持扩展属性的文件系统上表现最佳,但扩展属性支
+持并非严格要求。在标准发行版("vanilla")上运行 Smack
+内核是安全的。
+
+Smack 内核使用 CIPSO IP 选项。某些网络配置不接受 IP
+选项,可能会阻碍使用 Smack 的系统。
+
+Smack 已在 Tizen 操作系统中使用。了解 Smack 在 Tizen 中
+的使用方式,请访问 http://wiki.tizen.org 。
+
+当前 Smack 用户空间的 git 仓库为:
+
+    git://github.com/smack-team/smack.git
+
+大多数现代发行版都可以直接编译并安装。
+smackutil 中包含五个命令:
+
+chsmack:
+    显示或设置 Smack 扩展属性值
+
+smackctl:
+    加载 Smack 访问规则
+
+smackaccess:
+    检查具有某标签的进程是否能访问另一个标签的对象
+
+随着 smackfs/load2 和 smackfs/cipso2 接口的引入,
+下面这两个命令已经过时:
+
+smackload:
+    为写入 smackfs/load 格式化数据
+
+smackcipso:
+    为写入 smackfs/cipso 格式化数据
+
+在 Smack 的理念下,配置数据极少且非强制要求。最关键
+的步骤是挂载 smackfs 伪文件系统。如果已安装 smackutil,
+启动脚本会自动完成;也可以手动完成。
+
+在 ``/etc/fstab`` 中加入如下行可实现自动挂载::
+
+    smackfs /sys/fs/smackfs smackfs defaults 0 0
+
+``/sys/fs/smackfs`` 目录由内核创建。
+
+Smack 使用扩展属性 (xattr) 在文件系统对象上存储标签,
+这些属性位于 security 命名空间。改变标签需要
+CAP_MAC_ADMIN 权限。
+
+Smack 使用的扩展属性如下:
+
+SMACK64
+    用于访问控制决策。新创建文件对象默认获得创建进程的标签。
+
+SMACK64EXEC
+    当带该属性的程序被 exec 时,进程将使用该属性值作为标签。
+
+SMACK64MMAP
+    如果某进程的 Smack 标签所允许的访问,未能涵盖带有该属性所含标签的进程可获
+    得的全部访问,则禁止该进程对此文件进行 mmap。这是一个针对共享库的特定使用场景。
+
+SMACK64TRANSMUTE
+    仅能设置为 "TRUE"。若目录具有该属性,并且在目录中创建对象时,
+    允许写入该目录的 Smack 规则包含 transmute("t")模式,则对象将
+    继承目录标签而非创建进程标签。若创建的对象是目录,则
+    SMACK64TRANSMUTE 属性也会被一并设置。
+
+SMACK64IPIN
+    仅对套接字的文件描述符可用。使用该属性中的 Smack 标签对递送到
+    该套接字的数据包进行访问控制决策。
+
+SMACK64IPOUT
+    仅对套接字的文件描述符可用。使用该属性中的 Smack 标签对从该
+    套接字发出的数据包进行访问控制决策。
+
+设置 Smack 标签的方式示例::
+
+    # attr -S -s SMACK64 -V "value" path
+    # chsmack -a value path
+
+进程可通过读取 ``/proc/self/attr/current`` 查看自身标签,
+具备 ``CAP_MAC_ADMIN`` 的进程可写入该文件修改标签。
+
+多数 Smack 配置通过写入 smackfs 文件系统完成,该伪文件系统挂载在
+``/sys/fs/smackfs`` 上。
+
+access
+    为向后兼容保留;推荐使用 ``access2`` 接口。
+    此接口报告具有指定 Smack 标签的主体是否对具有指定 Smack 标签
+    的客体拥有特定访问权限。向该文件写入一条固定格式的访问规则,
+    下一次读取将指示访问是否被允许。内容为 "1" 表示允许访问,
+    "0" 表示拒绝。
+
+access2
+    此接口报告具有指定 Smack 标签的主体是否对具有指定 Smack 标签
+    的客体拥有特定访问权限。向该文件写入一条长格式的访问规则,
+    下一次读取将指示访问是否被允许。内容为 "1" 表示允许访问,
+    "0" 表示拒绝。
+
+ambient
+    包含对未标记网络数据包应用的 Smack 标签。
+
+change-rule
+    此接口允许修改已有的访问控制规则。写入格式为::
+
+        "%s %s %s %s"
+
+    其中第一个字符串为主体标签,第二个为客体标签,第三个为允许的
+    访问权限,第四个为拒绝的访问权限。访问字符串只能包含 "rwxat-"
+    字符。如果给定主体和客体的规则已存在,将通过启用第三个字符串中
+    的权限并禁用第四个字符串中的权限来修改该规则。如果不存在这样
+    的规则,将使用第三和第四个字符串中指定的访问权限创建新规则。
+
+cipso
+    为向后兼容保留;推荐使用 ``cipso2`` 接口。
+    此接口允许将特定的 CIPSO 标头分配给 Smack 标签。
+    写入格式为::
+
+        "%24s%4d%4d"["%4d"]...
+
+    第一个字符串是固定长度的 Smack 标签。第一个数字是要使用的级别。
+    第二个数字是类别数量。后续数字是类别::
+
+        "level-3-cats-5-19          3   2   5  19"
+
+cipso2
+    此接口允许将特定的 CIPSO 标头分配给 Smack 标签。
+    写入格式为::
+
+        "%s%4d%4d"["%4d"]...
+
+    第一个字符串是长 Smack 标签。第一个数字是要使用的级别。
+    第二个数字是类别数量。后续数字是类别::
+
+        "level-3-cats-5-19   3   2   5  19"
+
+direct
+    包含用于网络数据包中 Smack 直接标签表示的 CIPSO 级别。
+
+doi
+    包含网络数据包中使用的 CIPSO 解释域。
+
+ipv6host
+    此接口允许将特定的 IPv6 互联网地址视为单标签主机。数据包仅从
+    对主机标签具有 Smack 写访问权限的进程发送到单标签主机。所有从
+    单标签主机接收的数据包将被赋予指定标签。写入格式为::
+
+        "%h:%h:%h:%h:%h:%h:%h:%h label" 或
+        "%h:%h:%h:%h:%h:%h:%h:%h/%d label"
+
+    不支持 "::" 地址缩写。如果标签为 "-DELETE",将删除匹配的条目。
+
+load
+    为向后兼容保留;推荐使用 ``load2`` 接口。
+    此接口允许指定系统定义规则之外的访问控制规则。
+    写入格式为::
+
+        "%24s%24s%5s"
+
+    其中第一个字符串为主体标签,第二个为客体标签,第三个为请求的
+    访问权限。访问字符串只能包含 "rwxat-" 字符,用于指定允许的访问
+    类型。"-" 是不允许的权限的占位符。字符串 "r-x--" 表示读和执行
+    访问。标签长度限制为 23 个字符。
+
+load2
+    此接口允许指定系统定义规则之外的访问控制规则。
+    写入格式为::
+
+        "%s %s %s"
+
+    其中第一个字符串为主体标签,第二个为客体标签,第三个为请求的
+    访问权限。访问字符串只能包含 "rwxat-" 字符,用于指定允许的访问
+    类型。"-" 是不允许的权限的占位符。字符串 "r-x--" 表示读和执行
+    访问。
+
+load-self
+    为向后兼容保留;推荐使用 ``load-self2`` 接口。
+    此接口允许定义进程特定的访问规则。这些规则仅在访问本来会被允许
+    时才被查询,旨在对进程施加额外的限制。格式与 ``load`` 接口相同。
+
+load-self2
+    此接口允许定义进程特定的访问规则。这些规则仅在访问本来会被允许
+    时才被查询,旨在对进程施加额外的限制。格式与 ``load2`` 接口相同。
+
+logging
+    包含 Smack 日志状态。
+
+mapped
+    包含用于网络数据包中 Smack 映射标签表示的 CIPSO 级别。
+
+netlabel
+    此接口允许将特定的互联网地址视为单标签主机。数据包不带 CIPSO
+    标头发送到单标签主机,但仅从对主机标签具有 Smack 写访问权限的
+    进程发送。所有从单标签主机接收的数据包将被赋予指定标签。
+    写入格式为::
+
+        "%d.%d.%d.%d label" 或 "%d.%d.%d.%d/%d label"
+
+    如果指定的标签是 "-CIPSO",则该地址被视为支持 CIPSO 标头的主机。
+
+onlycap
+    包含进程必须具有的标签,才能使 ``CAP_MAC_ADMIN`` 和
+    ``CAP_MAC_OVERRIDE`` 生效。如果此文件为空,这些能力对具有任何
+    标签的进程都有效。通过向文件写入所需标签(用空格分隔)来设置值,
+    或通过写入 "-" 来清除。
+
+ptrace
+    用于定义当前的 ptrace 策略:
+
+        0 - default(默认):
+            依赖 Smack 访问规则的策略。对于 ``PTRACE_READ``,主体需要
+            对客体具有读访问权限。对于 ``PTRACE_ATTACH``,需要读写访问
+            权限。
+
+        1 - exact(精确):
+            限制 ``PTRACE_ATTACH`` 的策略。仅当主体和客体标签相同时才
+            允许 attach。``PTRACE_READ`` 不受影响。可通过
+            ``CAP_SYS_PTRACE`` 覆盖。
+
+        2 - draconian(严格):
+            此策略行为与上述 'exact' 相同,但不能通过
+            ``CAP_SYS_PTRACE`` 覆盖。
+
+revoke-subject
+    向此文件写入一个 Smack 标签,会将所有以该标签为主体的访问规则中
+    的访问权限设置为 '-'。
+
+unconfined
+    如果内核配置了 ``CONFIG_SECURITY_SMACK_BRINGUP``,具有
+    ``CAP_MAC_ADMIN`` 的进程可以向此接口写入一个标签。此后,涉及该
+    标签的访问将被记录,并且即使本来不会被允许的访问也会被放行。
+    请注意,这是危险的,可能会破坏系统的正确标签设置。切勿在生产环境
+    中使用。
+
+relabel-self
+    此接口包含进程可以通过写入 ``/proc/self/attr/current`` 转换到的
+    标签列表。通常进程可以将自己的标签更改为任何合法值,但前提是具有
+    ``CAP_MAC_ADMIN``。此接口允许没有 ``CAP_MAC_ADMIN`` 的进程将自身
+    重新标记为预定义列表中的某个标签。没有 ``CAP_MAC_ADMIN`` 的进程
+    只能更改自己的标签一次。更改后,此列表将被清空。
+    通过向文件写入所需标签(用空格分隔)来设置值,或通过写入 "-"
+    来清除。
+
+如果使用 smackload 工具,可以在 ``/etc/smack/accesses`` 中添加访问规则。
+格式为::
+
+    subjectlabel objectlabel access
+
+其中 access 是字母 rwxatb 的组合,指定具有 subjectlabel 的主体对具有
+objectlabel 的客体所允许的访问类型。如果不存在规则,则不允许任何访问。
+
+更多程序请访问 http://schaufler-ca.com 。
+
+简化强制访问控制内核(白皮书)
+====================================
+
+Casey Schaufler
+casey@schaufler-ca.com
+
+强制访问控制
+------------
+
+计算机系统采用多种方案来约束信息在使用机器的用户和服务之间的共享方式。其
+中一些方案允许程序或用户自行决定哪些其他程序或用户可以访问数据片段。这类
+方案被称为自主访问控制(DAC)机制,因为访问控制由用户自主指定。另一些方案
+则不允许用户或程序自行决定能访问什么。这类方案被称为强制访问控制(MAC)机
+制,因为在哪些用户或程序能够访问数据片段的问题上,你没有选择余地。
+
+Bell & LaPadula
+---------------
+
+从 20 世纪 80 年代中期到世纪之交,强制访问控制(MAC)一直与美国国防部标记
+纸质文件的 Bell & LaPadula 安全模型紧密关联。这种形式的 MAC 在 Capital
+Beltway 地区以及斯堪的纳维亚超级计算机中心拥有追随者,但也常被指未能满足
+通用需求。
+
+域类型强制(DTE)
+-----------------
+
+世纪之交前后,域类型强制(DTE)开始流行。此方案将用户、程序和数据组织进相
+互隔离的域中。该方案已作为流行 Linux 发行版的组成部分被广泛部署。然而,维
+护此方案所需的管理开销,以及提供安全域映射所需的对整个系统的深入理解,导致
+该方案在大多数情况下要么被禁用,要么仅以受限方式使用。
+
+Smack
+-----
+
+Smack 是一种强制访问控制机制,旨在提供有用的 MAC 功能,同时避免前人的缺陷。
+Bell & LaPadula 所适用环境过于受限的问题,Smack 通过提供一种方案来克服:访问
+控制依据系统的需求及用途来确定,而非受制于晦涩的政府政策。域类型强制的复杂性,
+则通过用已有的访问模式来定义访问控制而得以避免。
+
+Smack 术语
+----------
+
+以下用于讨论 Smack 的术语,对接触过其他 MAC 系统的人来说会很熟悉,对初学者
+也不应太难掌握。其中四个术语有特定用法,尤为重要:
+
+  主体(Subject):
+	主体是计算机系统上的活动实体。在 Smack 中,主体即任务(task),
+	任务又是执行的基本单元。
+
+  客体(Object):
+	客体是计算机系统上的被动实体。在 Smack 中,各类文件、IPC 和任务
+	都可以是客体。
+
+  访问(Access):
+	主体试图向客体写入信息或从客体读取信息的任何尝试都是一次访问。
+
+  标签(Label):
+	标识主体或客体强制访问控制特征的数据。
+
+这些定义与安全社区中的传统用法一致。此外,还有一些来自 Linux 且可能频繁出
+现的术语:
+
+  能力(Capability):
+	拥有某项能力的任务有权违反系统安全策略的某个特定方面,该方面由
+	该项具体能力所标识。拥有一项或多项能力的任务是特权任务,而没有
+	任何能力的任务是非特权任务。
+
+  特权(Privilege):
+	被允许违反系统安全策略的任务称为拥有特权。截至本文撰写时,任务
+	可以通过拥有 capabilities,或者使其有效用户为 root 来获得特权。
+
+Smack 基础
+----------
+
+Smack 是对 Linux 系统的扩展。它根据附加在主体和客体上的标签,对哪些主体可
+以访问哪些客体施加额外的限制。
+
+标签
+~~~~
+
+Smack 标签是 ASCII 字符串。标签最长可达 255 个字符,但建议控制在 23 个字符
+以内。使用特殊字符(即字母或数字以外的任何字符)的单字符标签保留给 Smack
+开发团队使用。Smack 标签是无结构的、大小写敏感的,对标签执行的唯一操作是相
+等性比较。Smack 标签不能包含不可打印字符、"/"(斜线)、"\"(反斜线)、"'"
+(单引号)和 '"'(双引号)字符。
+
+Smack 标签不能以 '-' 开头。该字符保留给特殊选项使用。
+
+存在一些预定义的标签::
+
+	_ 	读作 "floor",单个下划线字符。
+	^ 	读作 "hat", 单个抑扬符号字符。
+	* 	读作 "star",单个星号字符。
+	? 	读作 "huh",  单个问号字符。
+	@ 	读作 "web", 单个 at 符号字符。
+
+Smack 系统上的每个任务都会被分配一个标签。进程的 Smack 标签通常由系统初始
+化机制分配。
+
+访问规则
+~~~~~~~~
+
+Smack 使用 Linux 的传统访问模式。这些模式包括读(read)、执行(execute)、
+写(write),有时还包括追加(append)。有几种情况下访问模式可能不那么显而易
+见,包括:
+
+  信号(Signals):
+	信号是从主体任务到客体任务的一次写操作。
+
+  Internet 域 IPC:
+	数据包的传输被视为从源任务到目标任务的一次写操作。
+
+Smack 根据主体所附标签及其试图访问的客体所附标签来限制访问。所强制执行的规
+则按优先级依次为:
+
+	1. 任何由标签为 "*" 的任务请求的访问一律拒绝。
+	2. 任何由标签为 "^" 的任务请求的读或执行访问一律允许。
+	3. 任何对标签为 "_" 的客体请求的读或执行访问一律允许。
+	4. 任何对标签为 "*" 的客体请求的访问一律允许。
+	5. 任何由任务对其标签相同的客体请求的访问一律允许。
+	6. 任何在已加载规则集中明确定义的访问请求一律允许。
+	7. 任何其他访问一律拒绝。
+
+Smack 访问规则
+~~~~~~~~~~~~~~
+
+借助 Smack 提供的隔离性,访问分离变得简单。同时也有许多有趣的场景,需要主体
+对不同标签的客体进行有限制的访问。一个例子是人们熟悉的按敏感度分级的间谍模型:
+从事高度机密项目的科学家可以阅读低密级文档,而她撰写的任何内容一"诞生"就是高
+密级的。为了适应此类方案,Smack 包含了一种用于指定允许跨标签访问的规则机制。
+
+访问规则格式
+~~~~~~~~~~~~
+
+访问规则的格式为::
+
+	subject-label object-label access
+
+其中 subject-label 是任务的 Smack 标签,object-label 是被访问对象的 Smack
+标签,access 是一个指定所允许访问类型的字符串。访问规格说明中会查找以下描述
+访问模式的字母:
+
+	a:	表示应授予追加(append)访问权限。
+	r:	表示应授予读(read)访问权限。
+	w:	表示应授予写(write)访问权限。
+	x:	表示应授予执行(execute)访问权限。
+	t:	表示该规则请求 transmutation(标签转变)。
+	b:	表示该规则应报告用于 bring-up 调试。
+
+规格说明字母的大写形式同样有效。访问模式规格说明的顺序可以任意。以下是一些
+可接受规则的示例::
+
+	TopSecret Secret  rx
+	Secret    Unclass R
+	Manager   Game    x
+	User      HR      w
+	Snap      Crackle rwxatb
+	New       Old     rRrRr
+	Closed    Off     -
+
+以下是一些不可接受规则的示例::
+
+	Top Secret Secret     rx
+	Ace        Ace        r
+	Odd        spells     waxbeans
+
+标签中不允许包含空格。由于主体总是可以访问具有相同标签的文件,因此为此类情
+况指定规则没有意义。访问规格说明中只允许有效的字母(rwxatbRWXATB)和破折号
+('-')。破折号是占位符,因此 "a-r" 与 "ar" 相同。单独一个破折号用于指明不
+允许任何访问。
+
+应用访问规则
+~~~~~~~~~~~~
+
+Linux 的开发者很少定义全新的事物类型,通常是从其他系统中引入方案和概念。最
+常见的情况是,这些其他系统是 Unix 的变体。Unix 有许多讨人喜欢的特性,但访
+问控制模型的一致性并非其中之一。Smack 力求在尽可能保持一致地处理访问的同时,
+不失底层机制的精神。
+
+文件系统对象(包括文件、目录、命名管道、符号链接和设备)所需的访问权限与
+mode bit 访问所使用的权限紧密匹配。打开文件进行读取需要对文件具有读访问权限。
+搜索目录需要执行访问权限。以写访问方式创建文件需要对其所在目录同时具有读写访问
+权限。删除文件需要对该文件及其所在目录同时具有读写访问权限。如果用户对所在目录
+具有读访问权限,但对其中标签不同的文件没有读访问权限,他就可能看得到该文件存在,
+却看不到它的任何属性。这是文件名属于目录中的数据而非文件的一部分所带来的现象。
+
+如果目录被标记为 transmuting(SMACK64TRANSMUTE=TRUE),并且某访问规则允许
+进程在该目录中创建对象且该规则包含 't' 访问模式,则新对象将被赋予目录的标
+签而非创建进程的标签。这使得两个拥有不同标签的进程更容易共享数据,而无需授
+予对彼此所有文件的访问权限。
+
+IPC 对象(消息队列、信号量集合和内存段)存在于扁平的名字空间中,访问请求只
+需与目标对象匹配即可。
+
+进程对象反映系统上的任务,用于访问它们的 Smack 标签与任务用于自身访问尝试
+的 Smack 标签相同。通过 kill() 系统调用发送信号是从发送者到接收者的一次写
+操作。调试进程需要读写两种访问权限。创建新任务是一个内部操作,会产生两个具
+有相同 Smack 标签的任务,并且无需访问检查。
+
+套接字(Socket)是附加在进程上的数据结构,从一个进程向另一个进程发送数据包
+要求发送者对接收者具有写访问权限。接收者不需要对发送者具有读访问权限。
+
+设置访问规则
+~~~~~~~~~~~~
+
+配置文件 /etc/smack/accesses 包含需要在系统启动时设置的规则。其内容会被写
+入特殊文件 /sys/fs/smackfs/load2。规则可以随时添加并立即生效。对于任意一对
+主体标签和客体标签,只能存在一条规则,最后指定的规则将覆盖之前的规定。
+
+任务属性
+~~~~~~~~
+
+进程的 Smack 标签可以从 ``/proc/<pid>/attr/current`` 读取。进程可以从
+``/proc/self/attr/current`` 读取自身的 Smack 标签。特权进程可以通过向
+``/proc/self/attr/current`` 写入来更改自身的 Smack 标签,但不能更改其他进程
+的标签。
+
+写入格式为:仅写入标签本身,或者标签后跟以下三种尾部字符之一:``\n``
+(根据 ``/proc/...`` 接口的通用约定)、``\0`` (因为某些应用程序会错误地包含
+它)、``\n\0`` (因为我们认为某些应用程序可能会错误地包含它)。
+
+文件属性
+~~~~~~~~
+
+文件系统对象的 Smack 标签以扩展属性形式存储在文件上,属性名为 SMACK64。此
+属性位于 security 名字空间。只能由具有特权的进程更改。
+
+特权
+~~~~
+
+拥有 CAP_MAC_OVERRIDE 或 CAP_MAC_ADMIN 的进程是特权进程。CAP_MAC_OVERRIDE
+允许进程访问原本会被拒绝的客体。CAP_MAC_ADMIN 允许进程更改 Smack 数据,包括
+规则和属性。
+
+Smack 网络
+~~~~~~~~~~
+
+如前所述,Smack 对网络协议传输强制执行访问控制。Smack 进程发送的每个数据包
+都带有其 Smack 标签。这是通过在 IP 数据包头部添加 CIPSO 标签实现的。每个接
+收的数据包都应有一个标识标签的 CIPSO 标签,如果没有,则假定为网络环境标签。
+在数据包投递之前,会检查数据包上标签对应的主体是否对接收进程具有写访问权限,
+如果不满足,数据包将被丢弃。
+
+CIPSO 配置
+~~~~~~~~~~
+
+通常不需要指定 CIPSO 配置。系统使用的默认值可以处理所有内部情况。Smack 会
+自动组合 CIPSO 标签值以匹配正在使用的 Smack 标签,无需管理员干预。进入系统
+的未标记数据包将被赋予环境标签。
+
+当可能遇到来自非 Smack 但支持 CIPSO 的系统时,Smack 需要额外配置。通常这
+会是 Trusted Solaris 系统,但还有其他较少部署的系统存在。CIPSO 为每个数据包
+提供 3 个重要值:解释域(DOI)、级别(level)和类别集(category set)。DOI
+用于标识一组使用兼容标记方案的系统,Smack 系统上指定的 DOI 必须与远程系统
+匹配,否则数据包将被丢弃。DOI 的默认值为 3。该值可以从 /sys/fs/smackfs/doi
+读取,也可以通过写入 /sys/fs/smackfs/doi 来更改。
+
+标签和类别集被映射为 /etc/smack/cipso 中定义的 Smack 标签。
+
+Smack/CIPSO 映射的形式为::
+
+	smack level [category [category]*]
+
+Smack 不要求 level 或 category 集合以任何特定方式相互关联,也不基于它们假
+定或分配访问权限。一些映射示例::
+
+	TopSecret 7
+	TS:A,B    7 1 2
+	SecBDE    5 2 4 6
+	RAFTERS   7 12 26
+
+Smack 标签中允许使用 ":" 和 "," 字符,但没有特殊含义。
+
+Smack 标签到 CIPSO 值的映射通过写入 /sys/fs/smackfs/cipso2 来定义。
+
+除显式映射外,Smack 还支持直接 CIPSO 映射。其中一个 CIPSO 级别用于指示数据
+包中传递的类别集实际上是 Smack 标签的编码。默认使用的级别是 250。该值可以
+从 /sys/fs/smackfs/direct 读取,并通过写入 /sys/fs/smackfs/direct 来更改。
+
+套接字属性
+~~~~~~~~~~
+
+有两个属性与套接字相关联。这些属性只能由特权任务设置,但任何任务都可以读取
+自己套接字的这些属性。
+
+  SMACK64IPIN:
+	任务对象的 Smack 标签。一个将强制执行策略的特权程序可以将其设置
+	为 star 标签。
+
+  SMACK64IPOUT:
+	随传出数据包传输的 Smack 标签。特权程序可以将其设置为希望与之通
+	信的另一个任务的标签。
+
+带有 BSD 地址的 UNIX 域套接字(UDS)既是文件系统中的文件也是套接字。作为文
+件,它携带 SMACK64 属性。此属性不参与 Smack 安全强制执行,并被不可变地分配
+标签 "*"。
+
+Smack Netlabel 例外
+~~~~~~~~~~~~~~~~~~~
+
+你会发现带有标签的应用程序经常需要与外部未标记的世界通信。为此,有一个特殊
+文件 /sys/fs/smackfs/netlabel,你可以在其中添加例外,格式为::
+
+	@IP1	   LABEL1 或
+	@IP2/MASK  LABEL2
+
+这意味着,如果你的应用程序对 LABEL1 具有写访问权限,则它将对 @IP1 具有未标
+记访问权限;如果它对 LABEL2 具有写访问权限,则将对子网 @IP2/MASK 具有访问
+权限。
+
+/sys/fs/smackfs/netlabel 文件中的条目按最长掩码优先进行匹配,类似于无类别
+IPv4 路由。
+
+其中还可以使用特殊标签 '@' 和选项 '-CIPSO'::
+
+	@      表示互联网,任何标签的应用程序都可以访问它
+	-CIPSO 表示标准 CIPSO 网络
+
+如果你不知道 CIPSO 是什么并且不打算使用它,只需执行::
+
+	echo 127.0.0.1 -CIPSO > /sys/fs/smackfs/netlabel
+	echo 0.0.0.0/0 @      > /sys/fs/smackfs/netlabel
+
+如果在 192.168.0.0/16 局域网中使用 CIPSO 同时还需要未标记的互联网访问::
+
+	echo 127.0.0.1      -CIPSO > /sys/fs/smackfs/netlabel
+	echo 192.168.0.0/16 -CIPSO > /sys/fs/smackfs/netlabel
+	echo 0.0.0.0/0      @      > /sys/fs/smackfs/netlabel
+
+为 Smack 编写应用程序
+---------------------
+
+在 Smack 系统上将运行三类应用程序。应用程序如何与 Smack 交互,决定了它需要
+做些什么才能在 Smack 下正常工作。
+
+不了解 Smack 的应用程序
+-----------------------
+
+绝大多数应用程序完全不需要关心 Smack 的独特属性。由于调用程序不会影响与进程
+关联的 Smack 标签,唯一可能引起关注的是进程是否对程序具有执行访问权限。
+
+了解 Smack 的应用程序
+---------------------
+
+有些程序可以通过了解 Smack 而变得更好用,但它们本身不做安全决策。ls(1) 命令
+就是这样一个程序的例子。
+
+强制执行 Smack 的应用程序
+-------------------------
+
+这些是特殊的程序,不仅了解 Smack,还参与系统策略的执行。在大多数情况下,这
+些是设置用户会话的程序。还有一些网络服务,为以不同标签运行的进程提供信息。
+
+文件系统接口
+------------
+
+Smack 使用扩展属性在文件系统对象上维护标签。可以使用 getxattr(2) 获取文件、
+目录或其他文件系统对象的 Smack 标签::
+
+	len = getxattr("/", "security.SMACK64", value, sizeof (value));
+
+将把根目录的 Smack 标签放入 value 中。特权进程可以使用 setxattr(2) 设置文件
+系统对象的 Smack 标签::
+
+	len = strlen("Rubble");
+	rc = setxattr("/foo", "security.SMACK64", "Rubble", len, 0);
+
+如果程序具有适当特权,将把 /foo 的 Smack 标签设置为 "Rubble"。
+
+套接字接口
+----------
+
+可以使用 fgetxattr(2) 读取套接字属性。
+
+特权进程可以使用 fsetxattr(2) 设置传出数据包的 Smack 标签::
+
+	len = strlen("Rubble");
+	rc = fsetxattr(fd, "security.SMACK64IPOUT", "Rubble", len, 0);
+
+如果程序具有适当特权,将把从该套接字发出的数据包的 Smack 标签设置为
+"Rubble"::
+
+	rc = fsetxattr(fd, "security.SMACK64IPIN", "*", strlen("*"), 0);
+
+如果程序具有适当特权,将把 Smack 标签 "*" 设置为检查传入数据包的客体标签。
+
+管理
+----
+
+Smack 支持一些挂载选项:
+
+  smackfsdef=label:
+	指定分配给缺少 Smack 标签扩展属性的文件的标签。
+
+  smackfsroot=label:
+	指定分配给文件系统根目录的标签(如果它缺少 Smack 扩展属性)。
+
+  smackfshat=label:
+	指定一个标签,该标签必须对文件系统上设置的所有标签具有读访问权限。
+	尚未强制执行。
+
+  smackfsfloor=label:
+	指定一个标签,文件系统上设置的所有标签必须对其具有读访问权限。
+	尚未强制执行。
+
+  smackfstransmute=label:
+	行为与 smackfsroot 完全一致,只是在挂载的根目录上同时设置
+	transmute 标志。
+
+这些挂载选项适用于所有文件系统类型。
+
+Smack 审计
+----------
+
+如果需要对安全事件进行 Smack 审计,需要在内核配置中设置 CONFIG_AUDIT。
+默认情况下,所有被拒绝的事件都会被审计。可以通过向 /sys/fs/smackfs/logging
+文件写入单个字符来更改此行为::
+
+	0 : 不记录日志
+	1 : 记录被拒绝的事件(默认)
+	2 : 记录被接受的事件
+	3 : 记录被拒绝和被接受的事件
+
+事件以 'key=value' 对的形式记录。对于每个事件,至少会得到主体、客体、请求
+的权限、动作、触发事件的内核函数,以及根据审计事件类型而定的其他键值对。
+
+Bringup 模式
+------------
+
+Bringup 模式提供了日志记录功能,可以使应用程序配置和系统 bringup 更加容易。
+在内核中配置 CONFIG_SECURITY_SMACK_BRINGUP 以启用这些功能。启用 bringup 模
+式后,由于标记了 "b" 访问模式的规则而成功的访问将被记录。当为进程引入新标签
+时,可以积极地添加标记了 "b" 的规则。日志记录使你可以跟踪哪些规则实际上被用
+于该标签。
+
+Bringup 模式的另一个功能是 "unconfined" 选项。向 /sys/fs/smackfs/unconfined
+写入一个标签,将使具有该标签的主体能够访问任何客体,同时使具有该标签的客体
+能被所有主体访问。任何由于标签被设置为 unconfined 而被允许的访问都将被记录。
+此功能是危险的,因为文件和目录可能会在策略正常执行时不允许的位置被创建。
-- 
2.43.0


^ permalink raw reply related

* [PATCH 01/10] docs/zh_CN: add LSM/index Chinese translation
From: Yan Zhu @ 2026-06-12 15:58 UTC (permalink / raw)
  To: alexs, si.yanteng, corbet, mic
  Cc: dzm91, skhan, gnoack, zhuyan2015, linux-doc,
	linux-security-module
In-Reply-To: <cover.1781105672.git.zhuyan2015@qq.com>

Translate Documentation/admin-guide/LSM/index.rst into Chinese, and
update admin-guide/index.rst to include LSM/index in the toctree.

Update the translation through commit 504f231cda56
("doc: ReSTify and split LSM.txt")

Assisted-by: Claude:deepseek-4-pro
Signed-off-by: Yan Zhu <zhuyan2015@qq.com>
---
 .../zh_CN/admin-guide/LSM/index.rst           | 46 +++++++++++++++++++
 .../translations/zh_CN/admin-guide/index.rst  |  3 +-
 2 files changed, 48 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/translations/zh_CN/admin-guide/LSM/index.rst

diff --git a/Documentation/translations/zh_CN/admin-guide/LSM/index.rst b/Documentation/translations/zh_CN/admin-guide/LSM/index.rst
new file mode 100644
index 000000000000..21e2b00af544
--- /dev/null
+++ b/Documentation/translations/zh_CN/admin-guide/LSM/index.rst
@@ -0,0 +1,46 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: ../../disclaimer-zh_CN.rst
+
+:Original: Documentation/admin-guide/LSM/index.rst
+:翻译:
+ 朱岩 Yan Zhu <zhuyan2015@qq.com>
+
+==================
+Linux 安全模块用法
+==================
+
+Linux 安全模块(LSM)框架提供了一种机制,允许新内核扩展在各种安全检查点挂载钩
+子。“module” 一词实际上有些误导,因为这些扩展并不是可加载的内核模块。它们在编
+译时通过 CONFIG_DEFAULT_SECURITY 选择,当多个 LSM 被编译进同一内核时,可以在
+启动时通过 ``security=...`` 内核命令行参数覆盖。
+
+LSM 接口的主要使用者是强制访问控制(MAC)扩展,提供完整的安全策略。典型例子包
+括 SELinux、Smack、Tomoyo、AppArmor。除了这些大型 MAC 扩展外,还可以利用 LSM
+构建其他扩展,在 Linux 本身未提供的系统行为上做特定修改。
+
+Linux capability 模块始终会被包含。其后可能出现任意数量的“次要”模块,且最多
+只有一个“主要”模块。有关 capability 模块的详细信息,请参见 Linux man-pages
+项目中的 ``capabilities(7)`` 手册页。
+
+可以通过读取 ``/sys/kernel/security/lsm`` 查看当前激活的安全模块列表。该列表
+以逗号分隔,并始终包括 capability 模块。列表的顺序即检查顺序。capability 模
+块始终排在第一位,随后是所有“次要”模块(例如 Yama),最后是那个“主要”模块
+(例如 SELinux),前提是配置了这样的模块。
+
+与“主要”安全模块关联的进程属性应通过 ``/proc/.../attr`` 中的特殊文件访问和维
+护。安全模块可能在该目录下维护以其名字命名的子目录,例如
+``/proc/.../attr/smack`` 由 Smack 模块提供,包含其所有专用文件。
+``/proc/.../attr`` 中的文件仍然是为提供子目录的模块保留的旧接口。
+
+.. toctree::
+   :maxdepth: 1
+
+   apparmor
+   LoadPin
+   SELinux
+   Smack
+   tomoyo
+   Yama
+   SafeSetID
+   ipe
+   landlock
diff --git a/Documentation/translations/zh_CN/admin-guide/index.rst b/Documentation/translations/zh_CN/admin-guide/index.rst
index bd01cf6474c8..10f9e3c577c3 100644
--- a/Documentation/translations/zh_CN/admin-guide/index.rst
+++ b/Documentation/translations/zh_CN/admin-guide/index.rst
@@ -52,10 +52,11 @@ Todolist:
 .. toctree::
    :maxdepth: 1
 
+   LSM/index
+
 Todolist:
 
 *   hw-vuln/index
-*   LSM/index
 *   perf-security
 
 
-- 
2.43.0


^ permalink raw reply related

* [PATCH 00/10] docs/zh_CN: add LSM admin-guide Chinese translation
From: Yan Zhu @ 2026-06-12 15:58 UTC (permalink / raw)
  To: alexs, si.yanteng, corbet, mic
  Cc: dzm91, skhan, gnoack, zhuyan2015, linux-doc,
	linux-security-module

This patch series adds the Chinese translation for the Linux Security Module
(LSM) admin-guide documentation, including the main index and all nine
sub-pages: apparmor, LoadPin, SELinux, Smack, tomoyo, Yama, SafeSetID,
ipe, and landlock.  The admin-guide toctree is also updated to list the
newly translated LSM/index.

The original English documentation was restructured into reStructuredText
by Kees Cook in commit 504f231cda56 ("doc: ReSTify and split LSM.txt"),
and each sub-page has been kept up-to-date with subsequent mainline changes.


base: https://git.kernel.org/pub/scm/linux/kernel/git/alexs/linux.git docs-next



Yan Zhu (10):
  docs/zh_CN: add LSM/index Chinese translation
  docs/zh_CN: add LSM/apparmor Chinese translation
  docs/zh_CN: add LSM/LoadPin Chinese translation
  docs/zh_CN: add LSM/SELinux Chinese translation
  docs/zh_CN: add LSM/Smack Chinese translation
  docs/zh_CN: add LSM/tomoyo Chinese translation
  docs/zh_CN: add LSM/Yama Chinese translation
  docs/zh_CN: add LSM/SafeSetID Chinese translation
  docs/zh_CN: add LSM/ipe Chinese translation
  docs/zh_CN: add LSM/landlock Chinese translation

 .../zh_CN/admin-guide/LSM/LoadPin.rst         |  33 +
 .../zh_CN/admin-guide/LSM/SELinux.rst         |  45 ++
 .../zh_CN/admin-guide/LSM/SafeSetID.rst       |  82 ++
 .../zh_CN/admin-guide/LSM/Smack.rst           | 722 +++++++++++++++++
 .../zh_CN/admin-guide/LSM/Yama.rst            |  71 ++
 .../zh_CN/admin-guide/LSM/apparmor.rst        |  59 ++
 .../zh_CN/admin-guide/LSM/index.rst           |  46 ++
 .../zh_CN/admin-guide/LSM/ipe.rst             | 723 ++++++++++++++++++
 .../zh_CN/admin-guide/LSM/landlock.rst        | 169 ++++
 .../zh_CN/admin-guide/LSM/tomoyo.rst          |  63 ++
 .../translations/zh_CN/admin-guide/index.rst  |   3 +-
 11 files changed, 2015 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/translations/zh_CN/admin-guide/LSM/LoadPin.rst
 create mode 100644 Documentation/translations/zh_CN/admin-guide/LSM/SELinux.rst
 create mode 100644 Documentation/translations/zh_CN/admin-guide/LSM/SafeSetID.rst
 create mode 100644 Documentation/translations/zh_CN/admin-guide/LSM/Smack.rst
 create mode 100644 Documentation/translations/zh_CN/admin-guide/LSM/Yama.rst
 create mode 100644 Documentation/translations/zh_CN/admin-guide/LSM/apparmor.rst
 create mode 100644 Documentation/translations/zh_CN/admin-guide/LSM/index.rst
 create mode 100644 Documentation/translations/zh_CN/admin-guide/LSM/ipe.rst
 create mode 100644 Documentation/translations/zh_CN/admin-guide/LSM/landlock.rst
 create mode 100644 Documentation/translations/zh_CN/admin-guide/LSM/tomoyo.rst

-- 
2.43.0


^ permalink raw reply

* [PATCH 07/10] docs/zh_CN: add LSM/Yama Chinese translation
From: Yan Zhu @ 2026-06-12 15:58 UTC (permalink / raw)
  To: alexs, si.yanteng, corbet, mic
  Cc: dzm91, skhan, gnoack, zhuyan2015, linux-doc,
	linux-security-module
In-Reply-To: <cover.1781105672.git.zhuyan2015@qq.com>

Translate Documentation/admin-guide/LSM/Yama.rst into Chinese.

Update the translation through commit 9d1bd9e8e028
("doc: yama: Swap HTTP for HTTPS and replace dead link")

Assisted-by: Claude:deepseek-4-pro
Signed-off-by: Yan Zhu <zhuyan2015@qq.com>
---
 .../zh_CN/admin-guide/LSM/Yama.rst            | 71 +++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100644 Documentation/translations/zh_CN/admin-guide/LSM/Yama.rst

diff --git a/Documentation/translations/zh_CN/admin-guide/LSM/Yama.rst b/Documentation/translations/zh_CN/admin-guide/LSM/Yama.rst
new file mode 100644
index 000000000000..ada3ec079432
--- /dev/null
+++ b/Documentation/translations/zh_CN/admin-guide/LSM/Yama.rst
@@ -0,0 +1,71 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: ../../disclaimer-zh_CN.rst
+
+:Original: Documentation/admin-guide/LSM/Yama.rst
+:翻译:
+ 朱岩 Yan Zhu <zhuyan2015@qq.com>
+
+
+====
+Yama
+====
+
+Yama 是一种 Linux 安全模块(LSM),用于收集系统范围内的 DAC(自主访问控制)
+安全保护,这些保护并不是由内核本身直接处理的。在编译时可通过
+``CONFIG_SECURITY_YAMA`` 选择,并可在运行时通过 ``/proc/sys/kernel/yama``
+中的 sysctl 接口控制:
+
+ptrace_scope
+============
+
+随着 Linux 的流行度提升,它将成为更大的恶意软件攻击目标。Linux 进程接口的一个
+突出弱点是单个用户能够检查其拥有的任意进程的内存和运行状态。例如,若 Pidgin
+被入侵,攻击者即可附加到其他运行中的进程(如 Firefox、SSH 会话、GPG 代理等),
+提取更多凭证,并在不依赖用户钓鱼的情况下扩大攻击范围。
+
+这不是理论上的问题。`SSH 会话劫持 <yama_ssh_hijack_>`_ 和
+`任意代码注入 <yama_code_injection_>`_ 攻击已经存在,并且只要允许 ptrace 像
+以往那样工作,这些攻击就仍然可能发生。由于 ptrace 并不是非开发者和非管理员常
+用的功能,系统构建者应当能够选择关闭此调试机制。
+
+一种解决方案是让某些应用使用 ``prctl(PR_SET_DUMPABLE, ...)`` 明确禁止 ptrace
+附加(如 ssh-agent),但大多数应用并未这样做。更通用的方案是只允许父进程直接
+ptrace 其子进程(即 ``gdb <child>``、``strace <child>`` 仍可工作),或者要求具备
+``CAP_SYS_PTRACE`` (即 root 仍可使用 ``gdb --pid=PID``、``strace -p PID``)。
+
+在模式 1 中,软件可以通过 ``prctl(PR_SET_PTRACER, pid, ...)`` 为调试进程与其
+子进程之间定义特定关系。子进程可声明哪些进程(及其后代)被允许调用
+``PTRACE_ATTACH``。每个子进程同一时间只能有一个此类声明的调试进程。例如 KDE、
+Chromium、Firefox 的崩溃处理器以及 Wine 用于相互 ptrace 的进程均采用此方式。
+若进程希望完全禁用这些限制,可调用
+``prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, ...)``,这样任何在其他方面本就被
+允许的进程(即使位于外部 pid 命名空间中)都可以进行附加。
+
+sysctl 配置如下(仅在拥有 ``CAP_SYS_PTRACE`` 时可写):
+
+0 - 经典 ptrace 权限:
+    一个进程可以对任何在相同 uid 下运行的其他进程执行 ``PTRACE_ATTACH``
+    操作,只要目标进程是可转储的(即未转换过 uid、未以特权启动,也未调用过
+    ``prctl(PR_SET_DUMPABLE...)``)。同样,``PTRACE_TRACEME`` 保持不变。
+
+1 - 限制性 ptrace:
+    进程必须与其欲 attach 的子进程预先建立关系。默认关系为仅限其后代,且满
+    足上述经典条件。若要更改这种关系,子进程可以调用
+    ``prctl(PR_SET_PTRACER, debugger, ...)`` 来声明允许的调试器 PID,以便在
+    该子进程上调用 ``PTRACE_ATTACH``。使用 ``PTRACE_TRACEME`` 则保持不变。
+
+2 - 仅管理员可 attach:
+    只有具备 ``CAP_SYS_PTRACE`` 权限的进程才能使用 ptrace,无论是通过
+    ``PTRACE_ATTACH`` 还是通过子进程调用 ``PTRACE_TRACEME``。
+
+3 - 禁止 attach:
+    任意进程不可使用 ``PTRACE_ATTACH`` 或 ``PTRACE_TRACEME``。一旦设定,此值
+    不可更改。
+
+最初的仅限子进程的逻辑是基于 grsecurity 中的限制而制定的。
+
+.. _yama_ssh_hijack:
+ https://www.blackhat.com/presentations/bh-usa-05/bh-us-05-boileau.pdf
+
+.. _yama_code_injection:
+ https://c-skills.blogspot.com/2007/05/injectso.html
-- 
2.43.0


^ permalink raw reply related

* [PATCH 06/10] docs/zh_CN: add LSM/tomoyo Chinese translation
From: Yan Zhu @ 2026-06-12 15:58 UTC (permalink / raw)
  To: alexs, si.yanteng, corbet, mic
  Cc: dzm91, skhan, gnoack, zhuyan2015, linux-doc,
	linux-security-module
In-Reply-To: <cover.1781105672.git.zhuyan2015@qq.com>

Translate Documentation/admin-guide/LSM/tomoyo.rst into Chinese.

Update the translation through commit c6144a21169f
("tomoyo: update project links")

Assisted-by: Claude:deepseek-4-pro
Signed-off-by: Yan Zhu <zhuyan2015@qq.com>
---
 .../zh_CN/admin-guide/LSM/tomoyo.rst          | 63 +++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 Documentation/translations/zh_CN/admin-guide/LSM/tomoyo.rst

diff --git a/Documentation/translations/zh_CN/admin-guide/LSM/tomoyo.rst b/Documentation/translations/zh_CN/admin-guide/LSM/tomoyo.rst
new file mode 100644
index 000000000000..a354c2ee1b35
--- /dev/null
+++ b/Documentation/translations/zh_CN/admin-guide/LSM/tomoyo.rst
@@ -0,0 +1,63 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: ../../disclaimer-zh_CN.rst
+
+:Original: Documentation/admin-guide/LSM/tomoyo.rst
+:翻译:
+ 朱岩 Yan Zhu <zhuyan2015@qq.com>
+
+
+======
+TOMOYO
+======
+
+TOMOYO 是什么?
+==============
+
+TOMOYO 是 Linux 内核中一种基于名称的 MAC(强制访问控制)扩展(LSM 模块)。
+
+LiveCD 示例教程可在以下地址获取:
+
+https://tomoyo.sourceforge.net/1.8/ubuntu12.04-live.html
+https://tomoyo.sourceforge.net/1.8/centos6-live.html
+
+虽然这些教程使用的是非 LSM 版本的 TOMOYO,但对理解 TOMOYO 的概念仍有帮助。
+
+如何启用 TOMOYO?
+================
+
+构建内核时启用 ``CONFIG_SECURITY_TOMOYO=y``,并在内核命令行加入
+``security=tomoyo`` 参数。
+
+详情请参阅 https://tomoyo.sourceforge.net/2.6/ 。
+
+文档在哪里?
+===========
+
+用户 ↔ 内核接口文档位于:
+
+https://tomoyo.sourceforge.net/2.6/policy-specification/index.html
+
+我们为研讨会和会议准备的材料可在以下地址获取:
+
+https://sourceforge.net/projects/tomoyo/files/docs/
+
+
+以下列出了三个方面精选的资料:
+
+TOMOYO 是什么?
+  TOMOYO Linux Overview
+    https://sourceforge.net/projects/tomoyo/files/docs/lca2009-takeda.pdf
+  TOMOYO Linux: pragmatic and manageable security for Linux
+    https://sourceforge.net/projects/tomoyo/files/docs/freedomhectaipei-tomoyo.pdf
+  TOMOYO Linux: A Practical Method to Understand and Protect Your Own Linux Box
+    https://sourceforge.net/projects/tomoyo/files/docs/PacSec2007-en-no-demo.pdf
+
+TOMOYO 能干什么?
+  Deep inside TOMOYO Linux
+    https://sourceforge.net/projects/tomoyo/files/docs/lca2009-kumaneko.pdf
+  The role of "pathname based access control" in security.
+    https://sourceforge.net/projects/tomoyo/files/docs/lfj2008-bof.pdf
+
+TOMOYO 的历史?
+  Realities of Mainlining
+    https://sourceforge.net/projects/tomoyo/files/docs/lfj2008.pdf
-- 
2.43.0


^ permalink raw reply related

* [PATCH 04/10] docs/zh_CN: add LSM/SELinux Chinese translation
From: Yan Zhu @ 2026-06-12 15:58 UTC (permalink / raw)
  To: alexs, si.yanteng, corbet, mic
  Cc: dzm91, skhan, gnoack, zhuyan2015, linux-doc,
	linux-security-module
In-Reply-To: <cover.1781105672.git.zhuyan2015@qq.com>

Translate Documentation/admin-guide/LSM/SELinux.rst into Chinese.

Update the translation through commit 17bd3c01667a
("documentation: add links to SELinux resources")

Assisted-by: Claude:deepseek-4-pro
Signed-off-by: Yan Zhu <zhuyan2015@qq.com>
---
 .../zh_CN/admin-guide/LSM/SELinux.rst         | 45 +++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 Documentation/translations/zh_CN/admin-guide/LSM/SELinux.rst

diff --git a/Documentation/translations/zh_CN/admin-guide/LSM/SELinux.rst b/Documentation/translations/zh_CN/admin-guide/LSM/SELinux.rst
new file mode 100644
index 000000000000..4962c7c24ec2
--- /dev/null
+++ b/Documentation/translations/zh_CN/admin-guide/LSM/SELinux.rst
@@ -0,0 +1,45 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: ../../disclaimer-zh_CN.rst
+
+:Original: Documentation/admin-guide/LSM/SELinux.rst
+:翻译:
+ 朱岩 Yan Zhu <zhuyan2015@qq.com>
+
+
+=======
+SELinux
+=======
+
+关于 SELinux 内核子系统的信息可以在以下链接获取:
+
+    https://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/selinux.git/tree/README.md
+
+    https://github.com/selinuxproject/selinux-kernel/wiki
+
+关于 SELinux 用户空间的资料可以在以下地址找到:
+
+    https://github.com/SELinuxProject/selinux/wiki
+
+如果你想使用 SELinux,通常需要使用发行版提供的策略,或者从以下地址获取最新的
+参考策略:
+
+    https://github.com/SELinuxProject/refpolicy
+
+如果你仅需安装一个用于测试的示例策略,可以使用位于 scripts/selinux 下的
+``mdp`` 工具。注意这要求系统已经安装了 SELinux 用户空间工具,尤其需要
+``checkpolicy`` 来编译内核策略,以及 ``setfiles`` 和 ``fixfiles`` 来为文件系
+统打标签。
+
+    1. 编译内核并启用 SELinux。
+    2. 运行 ``make`` 编译 ``mdp``。
+    3. 确认当前未启用 SELinux 且未加载真实策略。
+       如果已启用,请在继续前重启并在 SELinux 已禁用状态下操作。
+    4. 执行 ``install_policy.sh``::
+
+        cd scripts/selinux
+        sh install_policy.sh
+
+第 4 步会为当前内核生成一个新的示例策略,包含单一的 SELinux 用户、角色和类型。
+它会编译该策略,将 ``SELINUXTYPE`` 设置为 ``dummy``
+(写入 ``/etc/selinux/config``),并将策略以 ``dummy`` 名称安装,同时重新标
+记文件系统。
-- 
2.43.0


^ permalink raw reply related

* [PATCH 02/10] docs/zh_CN: add LSM/apparmor Chinese translation
From: Yan Zhu @ 2026-06-12 15:58 UTC (permalink / raw)
  To: alexs, si.yanteng, corbet, mic
  Cc: dzm91, skhan, gnoack, zhuyan2015, linux-doc,
	linux-security-module
In-Reply-To: <cover.1781105672.git.zhuyan2015@qq.com>

Translate Documentation/admin-guide/LSM/apparmor.rst into Chinese.

Update the translation through commit d00c2359fc18
("Docs: Update LSM/apparmor.rst")

Assisted-by: Claude:deepseek-4-pro
Signed-off-by: Yan Zhu <zhuyan2015@qq.com>
---
 .../zh_CN/admin-guide/LSM/apparmor.rst        | 59 +++++++++++++++++++
 1 file changed, 59 insertions(+)
 create mode 100644 Documentation/translations/zh_CN/admin-guide/LSM/apparmor.rst

diff --git a/Documentation/translations/zh_CN/admin-guide/LSM/apparmor.rst b/Documentation/translations/zh_CN/admin-guide/LSM/apparmor.rst
new file mode 100644
index 000000000000..6b0638aedacb
--- /dev/null
+++ b/Documentation/translations/zh_CN/admin-guide/LSM/apparmor.rst
@@ -0,0 +1,59 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: ../../disclaimer-zh_CN.rst
+
+:Original: Documentation/admin-guide/LSM/apparmor.rst
+:翻译:
+ 朱岩 Yan Zhu <zhuyan2015@qq.com>
+
+
+========
+AppArmor
+========
+
+AppArmor 是什么?
+================
+
+AppArmor 是 Linux 内核的 MAC(强制访问控制)安全扩展。它实现了以任务为中心的
+策略,任务的配置文件(profile)由用户空间创建并加载。系统中未定义 profile 的
+任务将在 unconfined(未限制)状态下运行,这相当于标准的 Linux DAC
+权限。
+
+如何启用和禁用
+==============
+
+设置 ``CONFIG_SECURITY_APPARMOR=y``
+
+如果希望将 AppArmor 设为默认的安全模块,请使用以下配置::
+
+   CONFIG_DEFAULT_SECURITY_APPARMOR=y
+
+``CONFIG_LSM`` 参数用于管理 LSM(Linux Security Module)的顺序和选择。请在列
+表中将 apparmor 指定为第一个 “主要” 模块(例如 AppArmor、SELinux、Smack)。
+
+构建内核
+--------
+
+如果 AppArmor 不是默认的安全模块,可以在内核命令行上添加
+``security=apparmor`` 来启用。
+
+如果 AppArmor 是默认的安全模块,则可以在内核命令行上添加
+``apparmor=0, security=XXXX``(其中 ``XXXX`` 为有效的安全模块)来禁用它。
+
+要让 AppArmor 在标准的 Linux DAC 权限之外施加任何限制,必须从用户空间向内核
+加载策略,具体请参阅文档和工具链接。
+
+文档
+====
+
+文档可在 wiki 上找到,链接如下。
+
+链接
+====
+
+Mailing List - apparmor@lists.ubuntu.com
+
+Wiki - http://wiki.apparmor.net
+
+User space tools - https://gitlab.com/apparmor
+
+Kernel module - git://git.kernel.org/pub/scm/linux/kernel/git/jj/linux-apparmor
-- 
2.43.0



^ permalink raw reply related

* [PATCH] Docs/mm/damon/design: fix a typo in the Address Unit section
From: Shardul Deshpande @ 2026-06-12 15:40 UTC (permalink / raw)
  To: SeongJae Park, Andrew Morton, David Hildenbrand, Lorenzo Stoakes,
	Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Jonathan Corbet, Shuah Khan,
	damon, linux-mm, linux-doc, linux-kernel

The "Address Unit" section misspelled the C type that the DAMON core
layer uses for monitoring target address ranges.  Correct it to read
"unsigned long".

Signed-off-by: Shardul Deshpande <iamsharduld@gmail.com>
---
 Documentation/mm/damon/design.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst
index afc7d52bd..899ac9c69 100644
--- a/Documentation/mm/damon/design.rst
+++ b/Documentation/mm/damon/design.rst
@@ -140,7 +140,7 @@ as Idle page tracking does.
 Address Unit
 ------------
 
-DAMON core layer uses ``unsinged long`` type for monitoring target address
+DAMON core layer uses ``unsigned long`` type for monitoring target address
 ranges.  In some cases, the address space for a given operations set could be
 too large to be handled with the type.  ARM (32-bit) with large physical
 address extension is an example.  For such cases, a per-operations set
-- 
2.43.0


^ permalink raw reply related

* RE: [PATCH v3 02/12] x86/resctrl: Add data structures and definitions for PLZA configuration
From: Luck, Tony @ 2026-06-12 15:40 UTC (permalink / raw)
  To: Chatre, Reinette, Babu Moger, corbet@lwn.net, Dave.Martin@arm.com,
	james.morse@arm.com, tglx@kernel.org, bp@alien8.de,
	dave.hansen@linux.intel.com
  Cc: skhan@linuxfoundation.org, x86@kernel.org, mingo@redhat.com,
	hpa@zytor.com, akpm@linux-foundation.org, rdunlap@infradead.org,
	pawan.kumar.gupta@linux.intel.com, feng.tang@linux.alibaba.com,
	dapeng1.mi@linux.intel.com, kees@kernel.org, elver@google.com,
	lirongqing@baidu.com, paulmck@kernel.org, bhelgaas@google.com,
	seanjc@google.com, alexandre.chartre@oracle.com,
	yazen.ghannam@amd.com, peterz@infradead.org, Bae, Chang Seok,
	kim.phillips@amd.com, xin@zytor.com, naveen@kernel.org,
	thomas.lendacky@amd.com, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, Eranian, Stephane,
	peternewman@google.com
In-Reply-To: <db9c0b3e-184c-4100-b59a-91f6e818fd31@intel.com>

> > diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
> > index 9dc6b610e4e2..623628d3c643 100644
> > --- a/arch/x86/include/asm/msr-index.h
> > +++ b/arch/x86/include/asm/msr-index.h
> > @@ -1287,10 +1287,17 @@
> >  /* - AMD: */
> >  #define MSR_IA32_MBA_BW_BASE               0xc0000200
> >  #define MSR_IA32_SMBA_BW_BASE              0xc0000280
> > +#define MSR_IA32_PQR_PLZA_ASSOC            0xc00003fc
> >  #define MSR_IA32_L3_QOS_ABMC_CFG   0xc00003fd
> >  #define MSR_IA32_L3_QOS_EXT_CFG            0xc00003ff
> >  #define MSR_IA32_EVT_CFG_BASE              0xc0000400
> >
> > +/* Lower 32 bits of MSR_IA32_PQR_PLZA_ASSOC */
> > +#define RMID_EN                            BIT(31)
> > +/* Upper 32 bits of MSR_IA32_PQR_PLZA_ASSOC */
> > +#define CLOSID_EN                  BIT(15)
> > +#define PLZA_EN                            BIT(31)
> > +
>
> This is unexpected. So far resctrl has only defined the MSR numbers in this file, not
> the individual fields. This seems a legitimate use of msr-index.h but creates inconsistency
> with how the fields of the other resctrl registers are defined. This may be ok so I am
> looking past this for now. Since I am not familiar with this use I am looking at other
> patterns of this and it seems that the register fields are usually defined right after
> the register to make this relationship clear and also use more verbose naming to establish
> this relationship ... I do not think such cryptic names should be used without context
> in such a global scope. Please compare with how other fields are defined at this scope.

There are also patches in flight to treat MSRs as a single "u64" and move away from
the low level implementation detail that the RDMSR/WRMSR instructions split into
upper/lower halves.

All the kernel interfaces are moving to rdmsrq() and wrmsrq() (together with related
functions).

So maybe:

#define PQR_PLZA_RMID_EN        BIT_ULL(31)
#define PQR_PLZA_CLOSID_EN      BIT_ULL(47)
#define PQR_PLZA_PLZA_EN        BIT_ULL(63)

[modify with whatever additional prefix characters seem necessary]

-Tony


 

^ permalink raw reply

* Re: [PATCH v3 00/12] x86/resctrl: Add kernel-mode (e.g., PLZA) support to the resctrl subsystem
From: Moger, Babu @ 2026-06-12 15:37 UTC (permalink / raw)
  To: Reinette Chatre, Babu Moger, corbet, tony.luck, Dave.Martin,
	james.morse, tglx, bp, dave.hansen
  Cc: skhan, x86, mingo, hpa, akpm, rdunlap, pawan.kumar.gupta,
	feng.tang, dapeng1.mi, kees, elver, lirongqing, paulmck, bhelgaas,
	seanjc, alexandre.chartre, yazen.ghannam, peterz, chang.seok.bae,
	kim.phillips, xin, naveen, thomas.lendacky, linux-doc,
	linux-kernel, eranian, peternewman
In-Reply-To: <a1dbbb1a-ef78-468a-a80c-572a85220bbe@intel.com>



On 6/11/2026 4:53 PM, Reinette Chatre wrote:
> Hi Babu,
> 
> On 4/30/26 4:24 PM, Babu Moger wrote:
>> Design
>> ======
>>
>> A new sysfs file, info/kernel_mode, holds a single global policy that
>> selects what kernel work is steered and which rdtgroup it is steered
> 
> How should "selects *what* kernel work is steered" be interpreted? Do these
> modes not all apply to *all* kernel work?

How about?

A new sysfs file, info/kernel_mode, holds a single global policy for 
kernel contexts and the rdtgroup associated with the policy.

> 
>> to.  Reads describe the supported modes and the currently-active
>> binding; writes change the policy or rebind to a different group.
>> Look at the thread below for design discussion.
>> https://lore.kernel.org/lkml/14a8ad0a-e842-4268-871a-0762f1169e03@intel.com/
>>
> 
> ...
> 
>> Examples
>> ========
>>
>> (See Documentation/filesystems/resctrl.rst, "kernel_mode" and
>> "kmode_cpus" sections, for the full UAPI.)
>>
>>    # Mount resctrl
>>    # mount -t resctrl resctrl /sys/fs/resctrl
>>    # cd /sys/fs/resctrl
>>
>>    # Read the supported modes.  The active mode is bracketed and reports
>>    # the bound "<ctrl>/<mon>/" group; other supported modes report
>>    # ":group=none" because nothing is bound to them.
>>    # cat info/kernel_mode
>>    [inherit_ctrl_and_mon:group=//]
> 
> This is unexpected since associating a group to this mode implies that this
> group is used to manage allocations and monitoring of kernel work but this
> is not true, right? From what I understand there should be no group associated with
> this default "inherit_ctrl_and_mon" mode.

The default mode is "inherit_ctrl_and_mon", where both user mode and 
kernel mode share the same CLOSID and RMID. This is the current mode 
(without this series).

I thought we were going to set the default mode with the default group 
when the system boots up. No?


> 
>>    global_assign_ctrl_inherit_mon_per_cpu:group=none
>>    global_assign_ctrl_assign_mon_per_cpu:group=none
> 
> nit: "none" does not reflect state as clearly as "unset"/"uninitialized"/"NA"

Let's go with "uninitialized".

> 
>>
>>    # Create a CTRL_MON group plus a MON child and bind both the kernel
>>    # CLOSID and RMID to them.
>>    # mkdir ctrl1
>>    # mkdir ctrl1/mon_groups/mon1
>>    # echo "global_assign_ctrl_assign_mon_per_cpu:group=ctrl1/mon1/" \
>>            > info/kernel_mode
>>    # cat info/kernel_mode
>>    inherit_ctrl_and_mon:group=none
>>    global_assign_ctrl_inherit_mon_per_cpu:group=none
>>    [global_assign_ctrl_assign_mon_per_cpu:group=ctrl1/mon1/]
>>
>>    # kmode_cpus and kmode_cpus_list are visible only on the bound group.
>>    # ls ctrl1/kmode_cpus*
>>    ctrl1/kmode_cpus  ctrl1/kmode_cpus_list
> 
> Since it is ctrl1/mon1 that was bound, should these CPU files not appear
> in ctrl1/mon_groups/mon1 ?

Correct. Will fix it.


>>
>>    # Restrict the binding to a CPU subset; the write is incremental.
> 
> Does "incremental" mean that if the file contains CPUs 0-3 then writing
> "4" would set the CPUs to 0-4? This does not sound right since it is
> expected that user space can remove CPUs also?

Will remove incremental. Writing "4" will remove 0-3 and keep only 4.


> 
>>    # echo 0-3 > ctrl1/kmode_cpus_list
>>    # cat ctrl1/kmode_cpus
>>    f
>>    # cat ctrl1/kmode_cpus_list
>>    0-3
>>
>>    # Empty masks are rejected; use info/kernel_mode to reset to
>>    # "every online CPU".
>>    # echo "" > ctrl1/kmode_cpus_list
>>    bash: echo: write error: Invalid argument
>>    # cat info/last_cmd_status
>>    Empty mask not allowed; use info/kernel_mode to unbind
> 
> Why are empty masks rejected/not allowed?

No specific reason.

We discussed earlier that, when the mode is switched, it is applied 
globally to all the online CPUs.

At this point reading "kmode_cpus_list" will still report empty.

Users can change it to selectively apply the mode by writing to 
"kmode_cpus_list".

I was not sure what the action should be when an empty mask is written.

Should the empty mask apply the mode to all the online CPUs?


> 
>>
>>    # Disable kernel-mode steering (back to inherit, default group).
> 
> This sounds like kernel work is steered to default group which I
> do not think is accurate for the "inherit_ctrl_and_mon" mode.

How about ?

Drop the kernel-mode binding and restore inherit_ctrl_and_mon on the 
default group.

thanks
Babu



^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox