Linux Documentation
 help / color / mirror / Atom feed
* Re: [PATCH v8 29/46] KVM: selftests: Add selftests global for guest memory attributes capability
From: Fuad Tabba @ 2026-06-24 19:26 UTC (permalink / raw)
  To: ackerleytng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	jmattson, jthoughton, michael.roth, oupton, pankaj.gupta, qperret,
	rick.p.edgecombe, rientjes, shivankg, steven.price, willy, wyihan,
	yan.y.zhao, forkloop, pratyush, suzuki.poulose, aneesh.kumar,
	liam, Paolo Bonzini, Sean Christopherson, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, x86, H. Peter Anvin,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	Jonathan Corbet, Shuah Khan, Shuah Khan, Vishal Annapurve,
	Andrew Morton, Chris Li, Kairui Song, Kemeng Shi, Nhat Pham,
	Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park,
	Qi Zheng, Shakeel Butt, Kiryl Shutsemau, Baoquan He,
	Jason Gunthorpe, Vlastimil Babka, kvm, linux-kernel,
	linux-trace-kernel, linux-doc, linux-kselftest, linux-mm,
	linux-coco
In-Reply-To: <20260618-gmem-inplace-conversion-v8-29-9d2959357853@google.com>

On Fri, 19 Jun 2026 at 01:32, Ackerley Tng via B4 Relay
<devnull+ackerleytng.google.com@kernel.org> wrote:
>
> From: Sean Christopherson <seanjc@google.com>
>
> Add a global variable, kvm_has_gmem_attributes, to make the result of
> checking for KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES available to all tests.
>
> kvm_has_gmem_attributes is true if guest_memfd tracks memory attributes, as
> opposed to VM-level tracking.
>
> This global variable is synced to the guest for testing convenience, to
> avoid introducing subtle bugs when host/guest state is desynced.
>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>

Reviewed-by: Fuad Tabba <tabba@google.com>

Cheers,
/fuad

> ---
>  tools/testing/selftests/kvm/include/test_util.h | 2 ++
>  tools/testing/selftests/kvm/lib/kvm_util.c      | 5 +++++
>  2 files changed, 7 insertions(+)
>
> diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h
> index a56271c237ae9..51287fac8138a 100644
> --- a/tools/testing/selftests/kvm/include/test_util.h
> +++ b/tools/testing/selftests/kvm/include/test_util.h
> @@ -115,6 +115,8 @@ struct guest_random_state {
>  extern u32 guest_random_seed;
>  extern struct guest_random_state guest_rng;
>
> +extern bool kvm_has_gmem_attributes;
> +
>  struct guest_random_state new_guest_random_state(u32 seed);
>  u32 guest_random_u32(struct guest_random_state *state);
>
> diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
> index d5bbc80b2bf1c..b73817f7bc803 100644
> --- a/tools/testing/selftests/kvm/lib/kvm_util.c
> +++ b/tools/testing/selftests/kvm/lib/kvm_util.c
> @@ -24,6 +24,8 @@ u32 guest_random_seed;
>  struct guest_random_state guest_rng;
>  static u32 last_guest_seed;
>
> +bool kvm_has_gmem_attributes;
> +
>  static size_t vcpu_mmap_sz(void);
>
>  int __open_path_or_exit(const char *path, int flags, const char *enoent_help)
> @@ -521,6 +523,7 @@ struct kvm_vm *__vm_create(struct vm_shape shape, u32 nr_runnable_vcpus,
>         }
>         guest_rng = new_guest_random_state(guest_random_seed);
>         sync_global_to_guest(vm, guest_rng);
> +       sync_global_to_guest(vm, kvm_has_gmem_attributes);
>
>         kvm_arch_vm_post_create(vm, nr_runnable_vcpus);
>
> @@ -2286,6 +2289,8 @@ void __attribute((constructor)) kvm_selftest_init(void)
>         guest_random_seed = last_guest_seed = random();
>         pr_info("Random seed: 0x%x\n", guest_random_seed);
>
> +       kvm_has_gmem_attributes = kvm_has_cap(KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES);
> +
>         kvm_selftest_arch_init();
>  }
>
>
> --
> 2.55.0.rc0.738.g0c8ab3ebcc-goog
>
>

^ permalink raw reply

* Re: [PATCH v8 28/46] KVM: selftests: Add support for mmap() on guest_memfd in core library
From: Fuad Tabba @ 2026-06-24 19:07 UTC (permalink / raw)
  To: ackerleytng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	jmattson, jthoughton, michael.roth, oupton, pankaj.gupta, qperret,
	rick.p.edgecombe, rientjes, shivankg, steven.price, willy, wyihan,
	yan.y.zhao, forkloop, pratyush, suzuki.poulose, aneesh.kumar,
	liam, Paolo Bonzini, Sean Christopherson, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, x86, H. Peter Anvin,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	Jonathan Corbet, Shuah Khan, Shuah Khan, Vishal Annapurve,
	Andrew Morton, Chris Li, Kairui Song, Kemeng Shi, Nhat Pham,
	Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park,
	Qi Zheng, Shakeel Butt, Kiryl Shutsemau, Baoquan He,
	Jason Gunthorpe, Vlastimil Babka, kvm, linux-kernel,
	linux-trace-kernel, linux-doc, linux-kselftest, linux-mm,
	linux-coco
In-Reply-To: <20260618-gmem-inplace-conversion-v8-28-9d2959357853@google.com>

On Fri, 19 Jun 2026 at 01:32, Ackerley Tng via B4 Relay
<devnull+ackerleytng.google.com@kernel.org> wrote:
>
> From: Sean Christopherson <seanjc@google.com>
>
> Accept gmem_flags in vm_mem_add() to be able to create a guest_memfd within
> vm_mem_add().
>
> When vm_mem_add() is used to set up a guest_memfd for a memslot, set up the
> provided (or created) gmem_fd as the fd for the user memory region. This
> makes it available to be mmap()-ed from just like fds from other memory
> sources. mmap() from guest_memfd using the provided gmem_flags and
> gmem_offset.
>
> Add a kvm_slot_to_fd() helper to provide convenient access to the file
> descriptor of a memslot.
>
> Update existing callers of vm_mem_add() to pass 0 for gmem_flags to
> preserve existing behavior.
>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> [For guest_memfds, mmap() using gmem_offset instead of 0 all the time.]
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>

Reviewed-by: Fuad Tabba <tabba@google.com>

Cheers,
/fuad

> ---
>  tools/testing/selftests/kvm/include/kvm_util.h     |  7 +++++-
>  tools/testing/selftests/kvm/lib/kvm_util.c         | 27 ++++++++++++----------
>  .../kvm/x86/private_mem_conversions_test.c         |  2 +-
>  3 files changed, 22 insertions(+), 14 deletions(-)
>
> diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
> index d4c104cb0418f..0cacf3698b259 100644
> --- a/tools/testing/selftests/kvm/include/kvm_util.h
> +++ b/tools/testing/selftests/kvm/include/kvm_util.h
> @@ -700,7 +700,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
>                                  gpa_t gpa, u32 slot, u64 npages, u32 flags);
>  void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
>                 gpa_t gpa, u32 slot, u64 npages, u32 flags,
> -               int gmem_fd, u64 gmem_offset);
> +               int gmem_fd, u64 gmem_offset, u64 gmem_flags);
>
>  #ifndef vm_arch_has_protected_memory
>  static inline bool vm_arch_has_protected_memory(struct kvm_vm *vm)
> @@ -732,6 +732,11 @@ void *addr_gva2hva(struct kvm_vm *vm, gva_t gva);
>  gpa_t addr_hva2gpa(struct kvm_vm *vm, void *hva);
>  void *addr_gpa2alias(struct kvm_vm *vm, gpa_t gpa);
>
> +static inline int kvm_slot_to_fd(struct kvm_vm *vm, u32 slot)
> +{
> +       return memslot2region(vm, slot)->fd;
> +}
> +
>  #ifndef vcpu_arch_put_guest
>  #define vcpu_arch_put_guest(mem, val) do { (mem) = (val); } while (0)
>  #endif
> diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
> index 9b482778f7379..d5bbc80b2bf1c 100644
> --- a/tools/testing/selftests/kvm/lib/kvm_util.c
> +++ b/tools/testing/selftests/kvm/lib/kvm_util.c
> @@ -978,12 +978,13 @@ void vm_set_user_memory_region2(struct kvm_vm *vm, u32 slot, u32 flags,
>  /* FIXME: This thing needs to be ripped apart and rewritten. */
>  void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
>                 gpa_t gpa, u32 slot, u64 npages, u32 flags,
> -               int gmem_fd, u64 gmem_offset)
> +               int gmem_fd, u64 gmem_offset, u64 gmem_flags)
>  {
>         int ret;
>         struct userspace_mem_region *region;
>         size_t backing_src_pagesz = get_backing_src_pagesz(src_type);
>         size_t mem_size = npages * vm->page_size;
> +       off_t mmap_offset = 0;
>         size_t alignment = 1;
>
>         TEST_REQUIRE_SET_USER_MEMORY_REGION2();
> @@ -1055,8 +1056,6 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
>
>         if (flags & KVM_MEM_GUEST_MEMFD) {
>                 if (gmem_fd < 0) {
> -                       u32 gmem_flags = 0;
> -
>                         TEST_ASSERT(!gmem_offset,
>                                     "Offset must be zero when creating new guest_memfd");
>                         gmem_fd = vm_create_guest_memfd(vm, mem_size, gmem_flags);
> @@ -1077,13 +1076,17 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
>         }
>
>         region->fd = -1;
> -       if (backing_src_is_shared(src_type))
> +       if (flags & KVM_MEM_GUEST_MEMFD && gmem_flags & GUEST_MEMFD_FLAG_MMAP) {
> +               region->fd = kvm_dup(gmem_fd);
> +               mmap_offset = gmem_offset;
> +       } else if (backing_src_is_shared(src_type)) {
>                 region->fd = kvm_memfd_alloc(region->mmap_size,
>                                              src_type == VM_MEM_SRC_SHARED_HUGETLB);
> +       }
>
> -       region->mmap_start = kvm_mmap(region->mmap_size, PROT_READ | PROT_WRITE,
> -                                     vm_mem_backing_src_alias(src_type)->flag,
> -                                     region->fd);
> +       region->mmap_start = __kvm_mmap(region->mmap_size, PROT_READ | PROT_WRITE,
> +                                       vm_mem_backing_src_alias(src_type)->flag,
> +                                       region->fd, mmap_offset);
>
>         TEST_ASSERT(!is_backing_src_hugetlb(src_type) ||
>                     region->mmap_start == align_ptr_up(region->mmap_start, backing_src_pagesz),
> @@ -1129,10 +1132,10 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
>
>         /* If shared memory, create an alias. */
>         if (region->fd >= 0) {
> -               region->mmap_alias = kvm_mmap(region->mmap_size,
> -                                             PROT_READ | PROT_WRITE,
> -                                             vm_mem_backing_src_alias(src_type)->flag,
> -                                             region->fd);
> +               region->mmap_alias = __kvm_mmap(region->mmap_size,
> +                                               PROT_READ | PROT_WRITE,
> +                                               vm_mem_backing_src_alias(src_type)->flag,
> +                                               region->fd, mmap_offset);
>
>                 /* Align host alias address */
>                 region->host_alias = align_ptr_up(region->mmap_alias, alignment);
> @@ -1143,7 +1146,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm,
>                                  enum vm_mem_backing_src_type src_type,
>                                  gpa_t gpa, u32 slot, u64 npages, u32 flags)
>  {
> -       vm_mem_add(vm, src_type, gpa, slot, npages, flags, -1, 0);
> +       vm_mem_add(vm, src_type, gpa, slot, npages, flags, -1, 0, 0);
>  }
>
>  /*
> diff --git a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c
> index 1d2f5d4fd45d7..861baff201e78 100644
> --- a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c
> +++ b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c
> @@ -399,7 +399,7 @@ static void test_mem_conversions(enum vm_mem_backing_src_type src_type, u32 nr_v
>         for (i = 0; i < nr_memslots; i++)
>                 vm_mem_add(vm, src_type, BASE_DATA_GPA + slot_size * i,
>                            BASE_DATA_SLOT + i, slot_size / vm->page_size,
> -                          KVM_MEM_GUEST_MEMFD, memfd, slot_size * i);
> +                          KVM_MEM_GUEST_MEMFD, memfd, slot_size * i, 0);
>
>         for (i = 0; i < nr_vcpus; i++) {
>                 gpa_t gpa =  BASE_DATA_GPA + i * per_cpu_size;
>
> --
> 2.55.0.rc0.738.g0c8ab3ebcc-goog
>
>

^ permalink raw reply

* Re: [PATCH v8 24/46] KVM: guest_memfd: Make in-place conversion the default
From: Fuad Tabba @ 2026-06-24 18:57 UTC (permalink / raw)
  To: ackerleytng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	jmattson, jthoughton, michael.roth, oupton, pankaj.gupta, qperret,
	rick.p.edgecombe, rientjes, shivankg, steven.price, willy, wyihan,
	yan.y.zhao, forkloop, pratyush, suzuki.poulose, aneesh.kumar,
	liam, Paolo Bonzini, Sean Christopherson, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, x86, H. Peter Anvin,
	Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	Jonathan Corbet, Shuah Khan, Shuah Khan, Vishal Annapurve,
	Andrew Morton, Chris Li, Kairui Song, Kemeng Shi, Nhat Pham,
	Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park,
	Qi Zheng, Shakeel Butt, Kiryl Shutsemau, Baoquan He,
	Jason Gunthorpe, Vlastimil Babka, kvm, linux-kernel,
	linux-trace-kernel, linux-doc, linux-kselftest, linux-mm,
	linux-coco
In-Reply-To: <20260618-gmem-inplace-conversion-v8-24-9d2959357853@google.com>

On Fri, 19 Jun 2026 at 01:31, Ackerley Tng via B4 Relay
<devnull+ackerleytng.google.com@kernel.org> wrote:
>
> From: Ackerley Tng <ackerleytng@google.com>
>
> Make in-place conversion the default if the arch has private mem.
>
> The default can be overridden at compile type by enabling

compile _time_

> CONFIG_KVM_VM_MEMORY_ATTRIBUTES, or at KVM load time through a module
> parameter.
>
> In-place conversion also implies tracking a guest's private/shared state in
> guest_memfd. To avoid inconsistencies in the way memory attributes are
> tracked between the per-VM or by guest_memfd, make the module_param
> read-only (0444).
>
> Document that using per-VM attributes for tracking private/shared state of
> guest memory is deprecated in favor of tracking in guest_memfd.
>
> Warn if the admin sets gmem_in_place_conversion as false when
> CONFIG_KVM_VM_MEMORY_ATTRIBUTES is not enabled. Add warning in the code
> path where guest memory is populated for a CoCo VM, since that's the
> earliest point in a CoCo VM's lifecycle where memory attributes are
> queried. Unlike other query sites, this site is exclusively used by CoCo
> VMs.
>
> Signed-off-by: Sean Christopherson <seanjc@google.com>

> ---
>  arch/x86/kvm/Kconfig   | 7 ++++++-
>  virt/kvm/guest_memfd.c | 5 +++++
>  virt/kvm/kvm_main.c    | 3 ++-
>  3 files changed, 13 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> index c28393dc664eb..a3c189d765150 100644
> --- a/arch/x86/kvm/Kconfig
> +++ b/arch/x86/kvm/Kconfig
> @@ -85,7 +85,12 @@ config KVM_VM_MEMORY_ATTRIBUTES
>         bool "Enable per-VM PRIVATE vs. SHARED attributes (for CoCo VMs)"
>         help
>           Enable support for tracking PRIVATE vs. SHARED memory using per-VM
> -         memory attributes.
> +         memory attributes.  Using per-VM attributes are deprecated in favor

nit:
are->is

Reviewed-by: Fuad Tabba <tabba@google.com>

Cheers,
/fuad





> +         of tracking PRIVATE state in guest_memfd.  Select this if you need
> +         to run CoCo VMs using a VMM that doesn't support guest_memfd memory
> +         attributes.
> +
> +         If unsure, say N.
>
>  config KVM_SW_PROTECTED_VM
>         bool "Enable support for KVM software-protected VMs"
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 86c9f5b0863cb..5cb73543c03c8 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -1193,10 +1193,15 @@ static bool kvm_gmem_range_is_private(struct file *file, pgoff_t index,
>  {
>         struct maple_tree *mt = &GMEM_I(file_inode(file))->attributes;
>
> +#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
>         if (!gmem_in_place_conversion)
>                 return kvm_range_has_vm_memory_attributes(kvm, gfn, gfn + nr_pages,
>                                                           KVM_MEMORY_ATTRIBUTE_PRIVATE,
>                                                           KVM_MEMORY_ATTRIBUTE_PRIVATE);
> +#else
> +       if (WARN_ON_ONCE(!gmem_in_place_conversion))
> +               return false;
> +#endif
>
>         return kvm_gmem_range_has_attributes(mt, index, nr_pages,
>                                              KVM_MEMORY_ATTRIBUTE_PRIVATE);
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index dd1d18a1d2f68..46e92b5dc3804 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -102,7 +102,8 @@ static bool __ro_after_init allow_unsafe_mappings;
>  module_param(allow_unsafe_mappings, bool, 0444);
>
>  #ifdef kvm_arch_has_private_mem
> -bool __ro_after_init gmem_in_place_conversion = false;
> +bool __ro_after_init gmem_in_place_conversion = !IS_ENABLED(CONFIG_KVM_VM_MEMORY_ATTRIBUTES);
> +module_param(gmem_in_place_conversion, bool, 0444);
>  EXPORT_SYMBOL_FOR_KVM_INTERNAL(gmem_in_place_conversion);
>  #endif
>
>
> --
> 2.55.0.rc0.738.g0c8ab3ebcc-goog
>
>

^ permalink raw reply

* Re: [PATCH 1/2] cgroup/dmem: add per-region event counters
From: Tejun Heo @ 2026-06-24 18:52 UTC (permalink / raw)
  To: Hongfu Li
  Cc: hannes, mkoutny, corbet, skhan, dev, mripard, natalie.vock,
	cgroups, linux-doc, linux-kernel, dri-devel
In-Reply-To: <20260624031107.667253-2-lihongfu@kylinos.cn>

On Wed, Jun 24, 2026 at 11:11:06AM +0800, Hongfu Li wrote:
> Add dmem.events to report hierarchical low/max event counts per DMEM
> region.  Increment counters on dmem.max allocation failures and
> dmem.low protection events.  The file is available for non-root cgroups
> only.

Please don't double space in descs or comments. Also, maybe it's obvious but
it'd help if you list why and how this is useful. Why do we want to add
this?

> +  dmem.events
> +	A read-only file that reports the number of times each cgroup
> +	has hit its configured memory limits.  The format lists each
> +	region on a single line, followed by the event counters::
> +
> +	  drm/0000:03:00.0/vram0 low 0 max 3
> +	  drm/0000:03:00.0/stolen low 0 max 0

This isn't a supported file format. Please read the documentation on allowed
formats.

Thanks.

-- 
tejun

^ permalink raw reply

* Re: [PATCH v12 02/12] x86/bhi: Make clear_bhb_loop() effective on newer CPUs
From: Pawan Gupta @ 2026-06-24 17:49 UTC (permalink / raw)
  To: Nikolay Borisov
  Cc: x86, Jon Kohler, H. Peter Anvin, Josh Poimboeuf, David Kaplan,
	Sean Christopherson, Borislav Petkov, Dave Hansen, Peter Zijlstra,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, KP Singh,
	Jiri Olsa, David S. Miller, David Laight, Andy Lutomirski,
	Thomas Gleixner, Ingo Molnar, David Ahern, Martin KaFai Lau,
	Eduard Zingerman, Song Liu, Yonghong Song, John Fastabend,
	Stanislav Fomichev, Hao Luo, Paolo Bonzini, Jonathan Corbet,
	Jason Baron, Alice Ryhl, Steven Rostedt, Ard Biesheuvel,
	Shuah Khan, linux-kernel, kvm, Asit Mallick, Tao Zhang, bpf,
	netdev, linux-doc
In-Reply-To: <171efe97-fd87-45c1-9913-ff62eacab400@suse.com>

On Wed, Jun 24, 2026 at 03:12:28PM +0300, Nikolay Borisov wrote:
> 
> 
> On 23.06.26 г. 20:33 ч., Pawan Gupta wrote:
> > As a mitigation for BHI, clear_bhb_loop() executes branches that overwrite
> > the Branch History Buffer (BHB). On Alder Lake and newer parts this
> > sequence is not sufficient because it doesn't clear enough entries. This
> > was not an issue because these CPUs use the BHI_DIS_S hardware mitigation
> > in the kernel.
> > 
> > Now with VMSCAPE (BHI variant) it is also required to isolate branch
> > history between guests and userspace. Since BHI_DIS_S only protects the
> > kernel, the newer CPUs also use IBPB.
> > 
> > A cheaper alternative to the current IBPB mitigation is clear_bhb_loop().
> > But it currently does not clear enough BHB entries to be effective on newer
> > CPUs with larger BHB. At boot, dynamically set the loop count of
> > clear_bhb_loop() such that it is effective on newer CPUs too.
> > 
> > Introduce global loop counts, initializing them with appropriate value
> > based on the hardware feature X86_FEATURE_BHI_CTRL.
> > 
> > Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
> > Acked-by: Borislav Petkov (AMD) <bp@alien8.de>
> > Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
> 
> Reviewed-by: Nikolay Borisov <nik.borisov@suse.com>
> 
> Although AI brings up a valid argument about whether guests should be
> pessimized and fallback to the longer sequence ?

I don't disagree, but at the same time BHI mitigation for guest migration
is a different beast that should be addressed separately. A series that
adds virtual-SPEC_CTRL support is in the works. Expect the RFC to be posted
in a couple of weeks.

^ permalink raw reply

* Re: [PATCH v8 15/46] KVM: guest_memfd: Call arch invalidate hooks on conversion
From: Ackerley Tng @ 2026-06-24 17:46 UTC (permalink / raw)
  To: Sean Christopherson, Fuad Tabba
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	jmattson, jthoughton, michael.roth, oupton, pankaj.gupta, qperret,
	rick.p.edgecombe, rientjes, shivankg, steven.price, willy, wyihan,
	yan.y.zhao, forkloop, pratyush, suzuki.poulose, aneesh.kumar,
	liam, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen,
	Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt,
	Kiryl Shutsemau, Baoquan He, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <ajneQVLriUshjFIO@google.com>

Sean Christopherson <seanjc@google.com> writes:

> On Fri, Jun 19, 2026, Fuad Tabba wrote:
>> On Fri, 19 Jun 2026 at 01:31, Ackerley Tng via B4 Relay
>> <devnull+ackerleytng.google.com@kernel.org> wrote:
>> >
>> > From: Ackerley Tng <ackerleytng@google.com>
>> >
>> > When memory in guest_memfd is converted from private to shared, the
>> > platform-specific state associated with the guest-private pages must be
>> > invalidated or cleaned up.
>> >
>> > Iterate over the folios in the affected range and call the
>> > kvm_arch_gmem_invalidate() hook for each PFN range. This allows
>> > architectures to perform necessary teardown, such as updating hardware
>> > metadata or encryption states, before the pages are transitioned to the
>> > shared state.
>> >
>> > Invoke this helper after indicating to KVM's mmu code that an invalidation
>> > is in progress to stop in-flight page faults from succeeding.
>> >
>> > Reviewed-by: Fuad Tabba <tabba@google.com>
>> > Signed-off-by: Ackerley Tng <ackerleytng@google.com>
>>
>> Coming back to this after working through the arm64/pKVM side. My
>> Reviewed-by here is from the previous round and the patch hasn't
>> changed, but I missed an implication for arm64.
>>
>> kvm_arch_gmem_invalidate() is now called from two paths with the same
>> (start, end) signature: folio teardown (kvm_gmem_free_folio) and
>> private->shared conversion (here). For SNP/TDX that's fine, conversion is
>> destructive anyway. For pKVM the two need opposite content semantics:
>> conversion must preserve the page in place (same physical page, the point
>> of in-place conversion without encryption), while teardown must scrub it
>> before returning it to the host.
>>
>> The hook gets only a pfn range with no indication of which caller it's
>> serving, so arm64 can't give the two paths the behaviour they need. It
>> would help to signal intent on the conversion path: a reason/flag, a
>> separate hook, or not routing non-destructive conversion through the
>> teardown hook.
>>
>> arm64 isn't here yet, so this isn't urgent, but the hook is gaining a
>> second caller now, and it's cheaper to leave room for the distinction
>> than to change a generic contract other arches depend on later.
>
> Crud.  It may not be urgent for arm64, but it's urgent for other reasons that
> I "can't" describe in detail at the moment, and even if that weren't the case, I
> think we should clean things up now.  More below.
>
>> >  virt/kvm/guest_memfd.c | 41 +++++++++++++++++++++++++++++++++++++++++
>> >  1 file changed, 41 insertions(+)
>> >
>> > diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
>> > index 433f79047b9d1..3c94442bc8131 100644
>> > --- a/virt/kvm/guest_memfd.c
>> > +++ b/virt/kvm/guest_memfd.c
>> > @@ -607,6 +607,42 @@ static bool kvm_gmem_is_safe_for_conversion(struct inode *inode, pgoff_t start,
>> >         return safe;
>> >  }
>> >
>> > +#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
>> > +static void kvm_gmem_invalidate(struct inode *inode, pgoff_t start, pgoff_t end)
>
> Not your fault, but kvm_arch_gmem_invalidate() is badly misnamed.  It's not
> "invalidating" anything, it's much more of a "free" callback, as SNP uses it to
> put physical pages back into a shared state when a maybe-private folio is freed.
>
> As Fuad points out, (ab)using that hook for the private=>shared conversion case
> "works", but not broadly.  And it makes the bad name worse, because it's called
> from code that _is_ doing true invalidations.  For pKVM, it may not even need to
> do anything invalidation-like.
>

Thanks, I also didn't like the naming of kvm_gmem_invalidate(),
especially when conversions also calls
kvm_gmem_invalidate_{start,end}() and those do different things.

> To avoid a conflict with patches that are going to have priority over this series,
> to set the stage for arm64 support, and to avoid avoid bleeding vendor details
> into guest_memfd, as if they are core guest_memfd behavior (only SNP needs the
> "invalidation" on this specific transition), I think we should add an arch hook
> to do conversions straightaway.
>
> Unless there's a clever option I'm missing, it'll mean adding yet another
> HAVE_KVM_ARCH_GMEM_XXX flag?  Hmm, especially because IIUC, arm64/pKVM doesn't
> need a callback for this case, only the free_folio case.
>
>> > +{
>> > +       struct folio_batch fbatch;
>> > +       pgoff_t next = start;
>> > +       int i;
>> > +
>> > +       folio_batch_init(&fbatch);
>> > +       while (filemap_get_folios(inode->i_mapping, &next, end - 1, &fbatch)) {
>> > +               for (i = 0; i < folio_batch_count(&fbatch); ++i) {
>> > +                       struct folio *folio = fbatch.folios[i];
>> > +                       pgoff_t start_index, end_index;
>> > +                       kvm_pfn_t start_pfn, end_pfn;
>> > +
>> > +                       start_index = max(start, folio->index);
>> > +                       end_index = min(end, folio_next_index(folio));
>> > +                       /*
>> > +                        * end_index is either in folio or points to
>> > +                        * the first page of the next folio. Hence,
>> > +                        * all pages in range [start_index, end_index)
>> > +                        * are contiguous.
>> > +                        */
>> > +                       start_pfn = folio_file_pfn(folio, start_index);
>> > +                       end_pfn = start_pfn + end_index - start_index;
>> > +
>> > +                       kvm_arch_gmem_invalidate(start_pfn, end_pfn);
>> > +               }
>> > +
>> > +               folio_batch_release(&fbatch);
>> > +               cond_resched();
>> > +       }
>> > +}
>> > +#else
>> > +static void kvm_gmem_invalidate(struct inode *inode, pgoff_t start, pgoff_t end) {}
>> > +#endif
>> > +
>> >  static int __kvm_gmem_set_attributes(struct inode *inode, pgoff_t start,
>> >                                      size_t nr_pages, uint64_t attrs,
>> >                                      pgoff_t *err_index)
>> > @@ -647,7 +683,12 @@ static int __kvm_gmem_set_attributes(struct inode *inode, pgoff_t start,
>> >          */
>> >
>> >         kvm_gmem_invalidate_start(inode, start, end);
>> > +
>> > +       if (!to_private)
>> > +               kvm_gmem_invalidate(inode, start, end);
>
> E.g. instead make this something like this?
>
> 	kvm_gmem_set_pfn_attributes(...)
>
> Hrm, though that wastes folio lookups in the to_private case.  So maybe just this,
> assuming pKVM doesn't need to take additional action on conversions?
>
> 	if (!to_private)
> 		kvm_gmem_make_shared(...)
>
> Actually, if we do that, then we don't need a separate arch hook, just a separate
> config.  It'll still bleed SNP details into guest_memfd, but it'll at least be
> done in a way that's more explicitly arch specific (and it's no different than
> what we already do for PREPARE...).
>

pKVM needs some arch guest_memfd lifecycle functions that

+ for conversion, doesn't do anything,
+ for teardown, resets page state (IIUC it'll be reset to
  PKVM_PAGE_OWNED (by the host))

So I think we need different functions for those two stages in the
lifecycle of a page with guest_memfd? What if we have

CONFIG_HAVE_KVM_ARCH_GMEM_SET_PFN_ATTRIBUTES, which gates

+ kvm_gmem_should_set_pfn_attributes(attributes) and
  .gmem_should_set_pfn_attributes
+ kvm_gmem_set_pfn_attributes(start_pfn, end_pfn, attributes) and
  .gmem_set_pfn_attributes

CONFIG_HAVE_KVM_ARCH_GMEM_TEARDOWN, which gates

+ kvm_gmem_teardown() and .gmem_teardown

SNP:

+ .gmem_should_set_pfn_attributes = sev_gmem_should_set_pfn_attributes,
  and sev_gmem_should_set_pfn_attributes returns !is_private
+ Rename .gmem_invalidate and sev_gmem_invalidate to *set_pfn_attributes
+ .gmem_teardown = sev_gmem_set_pfn_attributes

TDX:

+ Disable CONFIG_HAVE_KVM_ARCH_GMEM_SET_PFN_ATTRIBUTES
+ Disable CONFIG_HAVE_KVM_ARCH_GMEM_TEARDOWN

pKVM:

+ Disable CONFIG_HAVE_KVM_ARCH_GMEM_SET_PFN_ATTRIBUTES
+ .gmem_teardown = pkvm_gmem_set_pfn_attributes

Suzuki, does this work for ARM CCA?

This way,

+ The if (is_private) check doesn't leak SNP details into guest_memfd
+ .gmem_make_shared doesn't stick out without a .gmem_make_private
+ .gmem_set_pfn_attributes, .gmem_prepare and .gmem_teardown are aligned
  conceptually as lifecycle hooks

+ I think the private/shared check for prepare can also be folded into
  preparation.
    + Preparation perhaps doesn't need a should_prepare equivalent since
      there's no iteration and getting the gfn is just doing some math?
    + In another patch series?

> E.g. this?  There will still be a looming rename conflict, but that's easy enough
> to handle.
>
> diff --git virt/kvm/guest_memfd.c virt/kvm/guest_memfd.c
> index 9ce5be7843f2..8aead0abd788 100644
> --- virt/kvm/guest_memfd.c
> +++ virt/kvm/guest_memfd.c
> @@ -648,8 +648,8 @@ static bool kvm_gmem_is_safe_for_conversion(struct inode *inode, pgoff_t start,
>         return safe;
>  }
>
> -#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
> -static void kvm_gmem_invalidate(struct inode *inode, pgoff_t start, pgoff_t end)
> +#ifdef CONFIG_KVM_ARCH_GMEM_FREE_ON_SHARED_CONVERSION
> +static void kvm_gmem_make_shared(struct inode *inode, pgoff_t start, pgoff_t end)
>  {
>         struct folio_batch fbatch;
>         pgoff_t next = start;
> @@ -681,7 +681,7 @@ static void kvm_gmem_invalidate(struct inode *inode, pgoff_t start, pgoff_t end)
>         }
>  }
>  #else
> -static void kvm_gmem_invalidate(struct inode *inode, pgoff_t start, pgoff_t end) {}
> +static void kvm_gmem_make_shared(struct inode *inode, pgoff_t start, pgoff_t end) { }
>  #endif
>
>  static int __kvm_gmem_set_attributes(struct inode *inode, pgoff_t start,
> @@ -729,7 +729,7 @@ static int __kvm_gmem_set_attributes(struct inode *inode, pgoff_t start,
>         kvm_gmem_invalidate_start(inode, start, end);
>
>         if (!to_private)
> -               kvm_gmem_invalidate(inode, start, end);
> +               kvm_gmem_make_shared(inode, start, end);
>
>         mas_store_prealloc(&mas, xa_mk_value(attrs));

^ permalink raw reply

* Re: [PATCH v17 01/28] drm/amd/display: Remove unnecessary SIGNAL_TYPE_HDMI_TYPE_A check
From: Harry Wentland @ 2026-06-24 17:15 UTC (permalink / raw)
  To: Nicolas Frattaroli, Leo Li, Rodrigo Siqueira, Alex Deucher,
	Christian König, David Airlie, Simona Vetter,
	Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann,
	Andrzej Hajda, Neil Armstrong, Robert Foss, Laurent Pinchart,
	Jonas Karlman, Jernej Skrabec, Sandy Huang, Heiko Stübner,
	Andy Yan, Jani Nikula, Rodrigo Vivi, Joonas Lahtinen,
	Tvrtko Ursulin, Dmitry Baryshkov, Sascha Hauer, Rob Herring,
	Jonathan Corbet, Shuah Khan, Daniel Stone
  Cc: kernel, amd-gfx, dri-devel, linux-kernel, linux-arm-kernel,
	linux-rockchip, intel-gfx, intel-xe, linux-doc, wayland-devel,
	Werner Sembach, Andri Yngvason
In-Reply-To: <20260609-color-format-v17-1-35739b5782cc@collabora.com>

On 2026-06-09 08:43, Nicolas Frattaroli wrote:
> From: Werner Sembach <wse@tuxedocomputers.com>
> 
> Remove unnecessary SIGNAL_TYPE_HDMI_TYPE_A check that was performed in the
> drm_mode_is_420_only() case, but not in the drm_mode_is_420_also() &&
> force_yuv420_output case.
> 
> Without further knowledge if YCbCr 4:2:0 is supported outside of HDMI,
> there is no reason to use RGB when the display
> reports drm_mode_is_420_only() even on a non HDMI connection.
> 
> This patch also moves both checks in the same if-case. This  eliminates an
> extra else-if-case.
> 
> Signed-off-by: Werner Sembach <wse@tuxedocomputers.com>
> Signed-off-by: Andri Yngvason <andri@yngvason.is>
> Tested-by: Andri Yngvason <andri@yngvason.is>
> Reviewed-by: Daniel Stone <daniel@fooishbar.org>
> Signed-off-by: Nicolas Frattaroli <nicolas.frattaroli@collabora.com>

Reviewed-by: Harry Wentland <harry.wentland@amd.com>

Harry

> ---
>  drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 9 +++------
>  1 file changed, 3 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> index ba7f98a87808..dfe97897127c 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> @@ -6917,12 +6917,9 @@ static void fill_stream_properties_from_drm_display_mode(
>  	timing_out->v_border_top = 0;
>  	timing_out->v_border_bottom = 0;
>  	/* TODO: un-hardcode */
> -	if (drm_mode_is_420_only(info, mode_in)
> -			&& stream->signal == SIGNAL_TYPE_HDMI_TYPE_A)
> -		timing_out->pixel_encoding = PIXEL_ENCODING_YCBCR420;
> -	else if (drm_mode_is_420_also(info, mode_in)
> -			&& aconnector
> -			&& aconnector->force_yuv420_output)
> +	if (drm_mode_is_420_only(info, mode_in) ||
> +	    (aconnector && aconnector->force_yuv420_output &&
> +	     drm_mode_is_420_also(info, mode_in)))
>  		timing_out->pixel_encoding = PIXEL_ENCODING_YCBCR420;
>  	else if ((connector->display_info.color_formats & BIT(DRM_OUTPUT_COLOR_FORMAT_YCBCR422))
>  			&& aconnector
> 


^ permalink raw reply

* Re: [PATCH v8 18/46] KVM: guest_memfd: Handle lru_add fbatch refcounts during conversion safety check
From: Sean Christopherson @ 2026-06-24 17:01 UTC (permalink / raw)
  To: Binbin Wu
  Cc: ackerleytng, aik, andrew.jones, brauner, chao.p.peng, david,
	jmattson, jthoughton, michael.roth, oupton, pankaj.gupta, qperret,
	rick.p.edgecombe, rientjes, shivankg, steven.price, tabba, willy,
	wyihan, yan.y.zhao, forkloop, pratyush, suzuki.poulose,
	aneesh.kumar, liam, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen,
	Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt,
	Kiryl Shutsemau, Baoquan He, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <6fc7f450-6d0a-494d-b295-297e4703148d@linux.intel.com>

On Tue, Jun 23, 2026, Binbin Wu wrote:
> On 6/19/2026 8:31 AM, Ackerley Tng via B4 Relay wrote:
> > @@ -606,12 +608,20 @@ static bool kvm_gmem_is_safe_for_conversion(struct inode *inode, pgoff_t start,
> >  	next = start;
> >  	while (safe && filemap_get_folios(mapping, &next, last, &fbatch)) {
> >  
> > -		for (i = 0; i < folio_batch_count(&fbatch); ++i) {
> > +		for (i = 0; i < folio_batch_count(&fbatch);) {
> >  			struct folio *folio = fbatch.folios[i];
> >  
> > -			if (folio_ref_count(folio) !=
> > -			    folio_nr_pages(folio) + filemap_get_folios_refcount) {
> > -				safe = false;
> > +			safe = (folio_ref_count(folio) ==
> > +				folio_nr_pages(folio) +
> > +				filemap_get_folios_refcount);
> > +
> > +			if (safe) {
> > +				++i;
> > +			} else if (folio_may_be_lru_cached(folio) &&
> > +				   !lru_drained) {
> > +				lru_add_drain_all();
> 
> It seems unprivileged userspace is able to trigger lru_add_drain_all() repeatedly
> by invoking KVM_SET_MEMORY_ATTRIBUTES2 in a loop, which could lead to DoS risk?

FIW, if there's a risk, then AFAICT fadvise() and memfd's F_ADD_SEALS already
have the same risk.

^ permalink raw reply

* Re: [PATCH v4 2/5] mm/zswap: Factor writeback loop out of shrink_worker()
From: Yosry Ahmed @ 2026-06-24 17:00 UTC (permalink / raw)
  To: Hao Jia
  Cc: akpm, tj, hannes, shakeel.butt, mhocko, mkoutny, nphamcs,
	chengming.zhou, muchun.song, roman.gushchin, linux-mm,
	linux-kernel, linux-doc, Hao Jia
In-Reply-To: <0916e673-861f-b472-7417-afbffbcc98ad@gmail.com>

On Wed, Jun 24, 2026 at 4:55 AM Hao Jia <jiahao.kernel@gmail.com> wrote:
>
>
>
> On 2026/6/23 07:36, Yosry Ahmed wrote:
> >> +/*
> >> + * Walk the memcg tree and write back zswap pages until the
> >> + * (lower_pages, upper_pages) window closes, or abort encounter
> >> + * MAX_RECLAIM_RETRIES times of the following conditions:
> >> + * - No writeback-candidate memcgs found in a memcg tree walk.
> >> + * - Shrinking a writeback-candidate memcg failed.
> >> + *
> >> + * For shrink_worker(), it passes lower=thr and upper=zswap_total_pages().
> >> + * The @upper limit is refreshed in each iteration by re-evaluating
> >> + * zswap_total_pages(), and the window closes once the total falls
> >> + * below the threshold.
> >
> > This is the wrong abstraction level, and it's obvious by the fact that
> > the function calls zswap_total_pages() again to recalcualte
> > 'upper_pages'. It gets much worse in the next patch as well.
> >
> > The lower_pages and upper_pages thing is also unnecessarily hard to
> > follow.
> >
> > The core of the reuse here is the retry logic. So maybe keep the memcg
> > iteration in the callers, and define a function that takes in one memcg
> > and reclaims one batch from it? failures and attempts can be passed into
> > the function to maintain the state across scans of different memcgs,
> > like zswap_shrink_walk_arg?
> >
> > WDYT?
>
>
> Perhaps something like this?
>
> struct zswap_shrink_state {
>      int attempts;
>      int failures;
>      bool stop;
> };
>
> static bool zswap_shrink_no_candidate(struct zswap_shrink_state *s)
> {
>      if (!s->attempts && ++s->failures == MAX_RECLAIM_RETRIES)
>          return true;
>
>      s->attempts = 0;
>      return false;
> }
>
> static long zswap_shrink_one(struct mem_cgroup *memcg,
>                   struct zswap_shrink_state *s)
> {
>      long shrunk;
>
>      shrunk = shrink_memcg(memcg, NR_ZSWAP_WB_BATCH);
>      if (shrunk == -ENOENT)
>          return 0;
>
>      s->attempts++;
>      if (shrunk <= 0 && ++s->failures == MAX_RECLAIM_RETRIES)
>          s->stop = true;

Do we need 'stop' or can we just return a value here to indicate that
we should stop (e.g. -EBUSY)?

>
>      return shrunk;
> }
>
> static void shrink_worker(struct work_struct *w)
> {
>      struct zswap_shrink_state s = {};
>      unsigned long thr;
>
>      /* Reclaim down to the accept threshold */
>      thr = zswap_accept_thr_pages();
>
>      while (zswap_total_pages() > thr) {
>          struct mem_cgroup *memcg;
>
>          cond_resched();
>
>          memcg = zswap_iter_global();
>          if (!memcg) {
>              if (zswap_shrink_no_candidate(&s))
>                  break;
>              continue;
>          }
>
>          zswap_shrink_one(memcg, &s);
>          /* Drop the extra reference taken by the iterator. */
>          mem_cgroup_put(memcg);
>          if (s.stop)
>              break;
>      }
> }
>
> We could also fold the logic of zswap_shrink_no_candidate() into
> zswap_shrink_one(), but adding a !memcg check inside zswap_shrink_one()
> feels a bit awkward.
>
> WDYT?

I think splitting the shrink/retry logic over 2 functions makes it
more difficult to follow, so yeah I think fold
zswap_shrink_no_candidate() into zswap_shrink_one(). Then the callers
only need to iterate memcgs (depending on the context) and call
zswap_shrink_one() for each of them.

^ permalink raw reply

* Re: [PATCH v4 1/5] mm/zswap: Extend shrink_memcg() writeback capability
From: Yosry Ahmed @ 2026-06-24 16:57 UTC (permalink / raw)
  To: Hao Jia
  Cc: akpm, tj, hannes, shakeel.butt, mhocko, mkoutny, nphamcs,
	chengming.zhou, muchun.song, roman.gushchin, linux-mm,
	linux-kernel, linux-doc, Hao Jia
In-Reply-To: <057ea303-4c27-1a6e-08de-cce26c699097@gmail.com>

>
> /*
>   * Scan up to @nr_to_scan pages across the per-node zswap LRUs of @memcg
>   * and write back the reclaimable ones.
>   *
>   * Since the second-chance algorithm rotates referenced entries to the
>   * LRU tail, the per-node scan is capped at the current LRU length so
>   * each entry is scanned at most once per call. It is up to the caller
>   * to handle retries, deciding whether to scan the next memcg to complete

Nit: "whether to scan another memcg to complete.."

>   * the full iteration, or to rescan the current memcg to drain its zswap
>   * entries.
>   *
>   * Return: The number of compressed bytes written back (>= 0), or -ENOENT
>   * if @memcg has writeback disabled, is a zombie cgroup, or has empty
>   * zswap LRUs.
>   */
> static long shrink_memcg(struct mem_cgroup *memcg, unsigned long nr_to_scan)
> {
>      struct zswap_shrink_walk_arg walk_arg = {
>          .bytes_written = 0,
>          .encountered_page_in_swapcache = false,
>      };
>      unsigned long nr_remaining = nr_to_scan;
>      int nid;
>
>      if (!mem_cgroup_zswap_writeback_enabled(memcg))
>          return -ENOENT;
>
>      /*
>       * Skip zombies because their LRUs are reparented and we would be
>       * reclaiming from the parent instead of the dead memcg.
>       */
>      if (memcg && !mem_cgroup_online(memcg))
>          return -ENOENT;
>
>      for_each_node_state(nid, N_NORMAL_MEMORY) {
>          unsigned long nr_to_walk;
>
>          /*
>           * Cap the walk at the current LRU length to ensure each entry is
>           * scanned at most once per call. Referenced entries are rotated
>           * to the tail for a second chance, and this bound prevents them
>           * from being revisited within a single call. Retries are left to
>           * the caller, which can choose to rescan the current memcg or
>           * move on to the next one.
>           */

Nit: Make this more concise since it's already explained above.

Otherwise this looks good to me, thank you!

>          nr_to_walk = min(nr_remaining,
>                   list_lru_count_one(&zswap_list_lru, nid, memcg));
>          if (!nr_to_walk)
>              continue;
>
>          nr_remaining -= nr_to_walk;
>          list_lru_walk_one(&zswap_list_lru, nid, memcg, &shrink_memcg_cb,
>                    &walk_arg, &nr_to_walk);
>          /* Return the unused share of the budget to the pool. */
>          nr_remaining += nr_to_walk;
>
>          if (!nr_remaining)
>              break;
>      }
>
>      /* Nothing was scanned: every LRU under @memcg was empty. */
>      if (nr_remaining == nr_to_scan)
>          return -ENOENT;
>
>      return walk_arg.bytes_written;
> }
>
>
> Thanks,
> Hao

^ permalink raw reply

* Re: [PATCH v8 18/46] KVM: guest_memfd: Handle lru_add fbatch refcounts during conversion safety check
From: Sean Christopherson @ 2026-06-24 16:57 UTC (permalink / raw)
  To: Ackerley Tng
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	jmattson, jthoughton, michael.roth, oupton, pankaj.gupta, qperret,
	rick.p.edgecombe, rientjes, shivankg, steven.price, tabba, willy,
	wyihan, yan.y.zhao, forkloop, pratyush, suzuki.poulose,
	aneesh.kumar, liam, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen,
	Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt,
	Kiryl Shutsemau, Baoquan He, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <20260618-gmem-inplace-conversion-v8-18-9d2959357853@google.com>

On Thu, Jun 18, 2026, Ackerley Tng wrote:
> When checking if a guest_memfd folio is safe for conversion, its refcount
> is examined. A folio may be present in a per-CPU lru_add fbatch, which
> temporarily increases its refcount. 

Under what circumstances does this happen, and what alternatives are there for
userspace to work around the issue?

^ permalink raw reply

* Re: [PATCH v3 1/2] dt-bindings: iio: dac: Add AD5529R
From: Conor Dooley @ 2026-06-24 16:35 UTC (permalink / raw)
  To: Janani Sunil
  Cc: Jonathan Cameron, Rodrigo Alencar, Nuno Sá, Janani Sunil,
	Lars-Peter Clausen, Michael Hennerich, David Lechner,
	Nuno Sá, Andy Shevchenko, Rob Herring, Krzysztof Kozlowski,
	Conor Dooley, Philipp Zabel, Jonathan Corbet, Shuah Khan,
	linux-iio, devicetree, linux-kernel, linux-doc, Mark Brown
In-Reply-To: <20260624-decrease-protector-6c883bd1ac47@spud>

[-- Attachment #1: Type: text/plain, Size: 974 bytes --]

On Wed, Jun 24, 2026 at 05:32:26PM +0100, Conor Dooley wrote:

> >     dac@0 {
> >         compatible = "adi,ad5529r-16";
> >         reg = <0>;
> >         adi,device-addrs = <0 1>;
> 
> I think this should be put in the channel itself and made generic.

I guess I should expand on that, putting it in the channel means it's
not tied to some device-specific knowledge about when each device
address is used.
It should be generic because there are at least 3 devices, from 2
different vendors, that we know of, using the exact same scheme.

> >         reset-gpios = <&gpio0 87 GPIO_ACTIVE_LOW>;
> >         vdd-supply  = <&vdd_reg>;
> >         hvdd-supply = <&hvdd_reg>;
> > 
> >         channel@0  { reg = <0>;  adi,output-range-microvolt = <0 5000000>; };
> >         channel@16 { reg = <16>; adi,output-range-microvolt = <0 40000000>; };
> >     };
> > 
> > Does this look reasonable to everyone?
> > 
> > Regards,
> > Janani Sunil
> > 



[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply

* Re: [PATCH v3 1/2] dt-bindings: iio: dac: Add AD5529R
From: Conor Dooley @ 2026-06-24 16:32 UTC (permalink / raw)
  To: Janani Sunil
  Cc: Jonathan Cameron, Rodrigo Alencar, Nuno Sá, Janani Sunil,
	Lars-Peter Clausen, Michael Hennerich, David Lechner,
	Nuno Sá, Andy Shevchenko, Rob Herring, Krzysztof Kozlowski,
	Conor Dooley, Philipp Zabel, Jonathan Corbet, Shuah Khan,
	linux-iio, devicetree, linux-kernel, linux-doc, Mark Brown
In-Reply-To: <9abc53d0-432f-48fc-9e21-4d9a3c5e129f@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 22849 bytes --]

On Wed, Jun 24, 2026 at 05:01:18PM +0200, Janani Sunil wrote:
> 
> On 6/23/26 16:57, Jonathan Cameron wrote:
> > On Tue, 23 Jun 2026 09:09:14 +0100
> > Rodrigo Alencar <455.rodrigo.alencar@gmail.com> wrote:
> > 
> > > On 22/06/26 13:20, Nuno Sá wrote:
> > > > On Mon, Jun 22, 2026 at 12:51:20PM +0100, Rodrigo Alencar wrote:
> > > > > On 22/06/26 11:29, Nuno Sá wrote:
> > > > > > On Mon, Jun 22, 2026 at 10:24:05AM +0100, Rodrigo Alencar wrote:
> > > > > > > On 21/06/26 15:33, Jonathan Cameron wrote:
> > > > > > > > On Fri, 19 Jun 2026 16:54:11 +0100
> > > > > > > > Nuno Sá <noname.nuno@gmail.com> wrote:
> > > > > > > > > On Fri, Jun 19, 2026 at 03:12:07PM +0100, Conor Dooley wrote:
> > > > > > > > > > On Fri, Jun 19, 2026 at 02:01:08PM +0100, Nuno Sá wrote:
> > > > > > > > > > > On Fri, Jun 19, 2026 at 12:40:54PM +0100, Conor Dooley wrote:
> > > > > > > > > > > > On Fri, Jun 19, 2026 at 12:36:55PM +0100, Conor Dooley wrote:
> > > > > > > > > > > > > On Fri, Jun 19, 2026 at 12:33:11PM +0200, Janani Sunil wrote:
> > > > > > > > > > > > > > On 6/14/26 21:44, Jonathan Cameron wrote:
> > > > > > > > > > > > > > > On Tue, 9 Jun 2026 16:47:23 +0200
> > > > > > > > > > > > > > > Janani Sunil <jan.sun97@gmail.com> wrote:
> > > > > > > > > > > > > > > > On 5/26/26 15:11, Rodrigo Alencar wrote:
> > > > > > > > > > > > > > > > > On 26/05/19 05:42PM, Janani Sunil wrote:
> > > > > > > > > > > > > > > > > > Devicetree bindings for AD5529R 16 channel 12/16 bit high voltage,
> > > > > > > > > > > > > > > > > > buffered voltage output digital-to-analog converter (DAC) with an
> > > > > > > > > > > > > > > > > > integrated precision reference.
> > > > > > > > > > > > > > > > > ...
> > > > > > > > > > > > > > > > > Probably others may comment on that, but...
> > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > This parent node may support device addressing for multi-device support through
> > > > > > > > > > > > > > > > > those ID pins. I suppose that each device may have its own power supplies or
> > > > > > > > > > > > > > > > > other resources like the toggle pins or reset and enable.
> > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > That way I suppose that an example would look like...
> > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > +patternProperties:
> > > > > > > > > > > > > > > > > > +  "^channel@([0-9]|1[0-5])$":
> > > > > > > > > > > > > > > > > > +    type: object
> > > > > > > > > > > > > > > > > > +    description: Child nodes for individual channel configuration
> > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > +    properties:
> > > > > > > > > > > > > > > > > > +      reg:
> > > > > > > > > > > > > > > > > > +        description: Channel number.
> > > > > > > > > > > > > > > > > > +        minimum: 0
> > > > > > > > > > > > > > > > > > +        maximum: 15
> > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > +      adi,output-range-microvolt:
> > > > > > > > > > > > > > > > > > +        description: |
> > > > > > > > > > > > > > > > > > +          Output voltage range for this channel as [min, max] in microvolts.
> > > > > > > > > > > > > > > > > > +          If not specified, defaults to 0V to 5V range.
> > > > > > > > > > > > > > > > > > +        oneOf:
> > > > > > > > > > > > > > > > > > +          - items:
> > > > > > > > > > > > > > > > > > +              - const: 0
> > > > > > > > > > > > > > > > > > +              - enum: [5000000, 10000000, 20000000, 40000000]
> > > > > > > > > > > > > > > > > > +          - items:
> > > > > > > > > > > > > > > > > > +              - const: -5000000
> > > > > > > > > > > > > > > > > > +              - const: 5000000
> > > > > > > > > > > > > > > > > > +          - items:
> > > > > > > > > > > > > > > > > > +              - const: -10000000
> > > > > > > > > > > > > > > > > > +              - const: 10000000
> > > > > > > > > > > > > > > > > > +          - items:
> > > > > > > > > > > > > > > > > > +              - const: -15000000
> > > > > > > > > > > > > > > > > > +              - const: 15000000
> > > > > > > > > > > > > > > > > > +          - items:
> > > > > > > > > > > > > > > > > > +              - const: -20000000
> > > > > > > > > > > > > > > > > > +              - const: 20000000
> > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > +    required:
> > > > > > > > > > > > > > > > > > +      - reg
> > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > +    additionalProperties: false
> > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > +required:
> > > > > > > > > > > > > > > > > > +  - compatible
> > > > > > > > > > > > > > > > > > +  - reg
> > > > > > > > > > > > > > > > > > +  - vdd-supply
> > > > > > > > > > > > > > > > > > +  - avdd-supply
> > > > > > > > > > > > > > > > > > +  - hvdd-supply
> > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > +dependencies:
> > > > > > > > > > > > > > > > > > +  spi-cpha: [ spi-cpol ]
> > > > > > > > > > > > > > > > > > +  spi-cpol: [ spi-cpha ]
> > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > +allOf:
> > > > > > > > > > > > > > > > > > +  - $ref: /schemas/spi/spi-peripheral-props.yaml#
> > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > +unevaluatedProperties: false
> > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > +examples:
> > > > > > > > > > > > > > > > > > +  - |
> > > > > > > > > > > > > > > > > > +    #include <dt-bindings/gpio/gpio.h>
> > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > +    spi {
> > > > > > > > > > > > > > > > > > +        #address-cells = <1>;
> > > > > > > > > > > > > > > > > > +        #size-cells = <0>;
> > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > +        dac@0 {
> > > > > > > > > > > > > > > > > > +            compatible = "adi,ad5529r-16";
> > > > > > > > > > > > > > > > > > +            reg = <0>;
> > > > > > > > > > > > > > > > > > +            spi-max-frequency = <25000000>;
> > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > +            vdd-supply = <&vdd_regulator>;
> > > > > > > > > > > > > > > > > > +            avdd-supply = <&avdd_regulator>;
> > > > > > > > > > > > > > > > > > +            hvdd-supply = <&hvdd_regulator>;
> > > > > > > > > > > > > > > > > > +            hvss-supply = <&hvss_regulator>;
> > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > +            reset-gpios = <&gpio0 87 GPIO_ACTIVE_LOW>;
> > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > +            #address-cells = <1>;
> > > > > > > > > > > > > > > > > > +            #size-cells = <0>;
> > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > +            channel@0 {
> > > > > > > > > > > > > > > > > > +                reg = <0>;
> > > > > > > > > > > > > > > > > > +                adi,output-range-microvolt = <0 5000000>;
> > > > > > > > > > > > > > > > > > +            };
> > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > +            channel@1 {
> > > > > > > > > > > > > > > > > > +                reg = <1>;
> > > > > > > > > > > > > > > > > > +                adi,output-range-microvolt = <(-10000000) 10000000>;
> > > > > > > > > > > > > > > > > > +            };
> > > > > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > > > > +            channel@2 {
> > > > > > > > > > > > > > > > > > +                reg = <2>;
> > > > > > > > > > > > > > > > > > +                adi,output-range-microvolt = <0 40000000>;
> > > > > > > > > > > > > > > > > > +            };
> > > > > > > > > > > > > > > > > > +        };
> > > > > > > > > > > > > > > > > > +    };
> > > > > > > > > > > > > > > > > ...
> > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > 	spi {
> > > > > > > > > > > > > > > > > 		#address-cells = <1>;
> > > > > > > > > > > > > > > > > 		#size-cells = <0>;
> > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > 		multi-dac@0 {
> > > > > > > > > > > > > > > > > 			compatible = "adi,ad5529r-16";
> > > > > > > > > > > > > > > > > 			reg = <0>;
> > > > > > > > > > > > > > > > > 			spi-max-frequency = <25000000>;
> > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > 			#address-cells = <1>;
> > > > > > > > > > > > > > > > > 			#size-cells = <0>;
> > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > 			dac@0 {
> > > > > > > > > > > > > > > > > 				reg = <0>;
> > > > > > > > > > > > > > > > > 				vdd-supply = <&vdd_regulator>;
> > > > > > > > > > > > > > > > > 				avdd-supply = <&avdd_regulator>;
> > > > > > > > > > > > > > > > > 				hvdd-supply = <&hvdd_regulator>;
> > > > > > > > > > > > > > > > > 				hvss-supply = <&hvss_regulator>;
> > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > 				reset-gpios = <&gpio0 87 GPIO_ACTIVE_LOW>;
> > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > 				#address-cells = <1>;
> > > > > > > > > > > > > > > > > 				#size-cells = <0>;
> > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > 				channel@0 {
> > > > > > > > > > > > > > > > > 					reg = <0>;
> > > > > > > > > > > > > > > > > 					adi,output-range-microvolt = <0 5000000>;
> > > > > > > > > > > > > > > > > 				};
> > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > 				channel@1 {
> > > > > > > > > > > > > > > > > 					reg = <1>;
> > > > > > > > > > > > > > > > > 					adi,output-range-microvolt = <(-10000000) 10000000>;
> > > > > > > > > > > > > > > > > 				};
> > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > 				channel@2 {
> > > > > > > > > > > > > > > > > 					reg = <2>;
> > > > > > > > > > > > > > > > > 					adi,output-range-microvolt = <0 40000000>;
> > > > > > > > > > > > > > > > > 				};
> > > > > > > > > > > > > > > > > 			}
> > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > 			dac@1 {
> > > > > > > > > > > > > > > > > 				reg = <1>;
> > > > > > > > > > > > > > > > > 				vdd-supply = <&vdd_regulator>;
> > > > > > > > > > > > > > > > > 				avdd-supply = <&avdd_regulator>;
> > > > > > > > > > > > > > > > > 				hvdd-supply = <&hvdd_regulator>;
> > > > > > > > > > > > > > > > > 				hvss-supply = <&hvss_regulator>;
> > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > 				reset-gpios = <&gpio0 88 GPIO_ACTIVE_LOW>;
> > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > 				#address-cells = <1>;
> > > > > > > > > > > > > > > > > 				#size-cells = <0>;
> > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > 				channel@0 {
> > > > > > > > > > > > > > > > > 					reg = <0>;
> > > > > > > > > > > > > > > > > 					adi,output-range-microvolt = <0 5000000>;
> > > > > > > > > > > > > > > > > 				};
> > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > 				channel@1 {
> > > > > > > > > > > > > > > > > 					reg = <1>;
> > > > > > > > > > > > > > > > > 					adi,output-range-microvolt = <(-10000000) 10000000>;
> > > > > > > > > > > > > > > > > 				};
> > > > > > > > > > > > > > > > > 			}
> > > > > > > > > > > > > > > > > 		};
> > > > > > > > > > > > > > > > > 	};
> > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > then you might need something like:
> > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > 	patternProperties:
> > > > > > > > > > > > > > > > > 		"^dac@[0-3]$":
> > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > and put most of the things under this node pattern.
> > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > So the main driver that you're putting together might need to handle up to four instances.
> > > > > > > > > > > > > > > > > Even if your current driver cannot handle this, the dt-bindings might need cover that.
> > > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > > Need to double check if each dac node needs a separate compatible, so you would maybe populate
> > > > > > > > > > > > > > > > > a platform data to be shared with the child nodes, which would be a separate driver.
> > > > > > > > > > > > > > > > > (not sure if it would make sense to mix and match ad5529r-16 and ad5529r-12).
> > > > > > > > > > > > > > > > Hi Rodrigo,
> > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > Thank you for looking at this.
> > > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > > For now, I would prefer to keep the binding scoped to a single AD5529R device instance. The current
> > > > > > > > > > > > > > > > hardware/use case we have only needs one device node and the driver is written around that model as well.
> > > > > > > > > > > > > > > > While the device addressing pins could allow multi-device topology, we do not have an actual platform using
> > > > > > > > > > > > > > > > that configuration at the moment, so I would prefer not to introduce an extra parent/child binding structure
> > > > > > > > > > > > > > > > speculatively without a validating use case.
> > > > > > > > > > > > > > > Interesting feature - kind of similar to address control on a typical i2c bus device, or
> > > > > > > > > > > > > > > looking at it another way a kind of distributed SPI mux.
> > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > Challenge of a binding is we need to anticipate the future.  So I think we do need something
> > > > > > > > > > > > > > > like Rodrigo is suggesting even if we only (for now) support a single instance in the driver.
> > > > > > > > > > > > > > > That would leave the path open to supporting the addressing at a later date.
> > > > > > > > > > > > > > > An alternative might be to look at it like a chained device setup. In those we pretend there
> > > > > > > > > > > > > > > is just one device with a lot of channels etc.  The snag is that here things are more loosely
> > > > > > > > > > > > > > > coupled whereas for those devices it tends to be you have to read / write the same register
> > > > > > > > > > > > > > > in all devices in the chain as one big SPI message.
> > > > > > > > > > > > > > > 
> > > > > > > > > > > > > > > +CC Mark Brown as he may know of some precedence for this feature. For his reference..
> > > > > > > > > > > > > > > - Each of these device has 2 ID pins.  The SPI transfers have to contain the 2 bit
> > > > > > > > > > > > > > > value that matches that or they are ignored.  Thus a single bus + 1 chip select can
> > > > > > > > > > > > > > > be used to talk to 4 devices.  Question is what that looks like in device tree + I guess
> > > > > > > > > > > > > > > longer term how to support it cleanly in SPI.
> > > > > > > > > > > > > I'd swear I have seen this before, from some Microchip devices. Let me
> > > > > > > > > > > > > see if I can find what I am thinking of...
> > > > > > > > > > > > 
> > > > > > > > > > > > microchip,mcp3911 and microchip,mcp3564 both seem to do this with
> > > > > > > > > > > > slightly different properties.
> > > > > > > > > > > > 
> > > > > > > > > > > >    microchip,device-addr:
> > > > > > > > > > > >      description: Device address when multiple MCP3911 chips are present on the same SPI bus.
> > > > > > > > > > > >      $ref: /schemas/types.yaml#/definitions/uint32
> > > > > > > > > > > >      enum: [0, 1, 2, 3]
> > > > > > > > > > > >      default: 0
> > > > > > > > > > > > 
> > > > > > > > > > > > and
> > > > > > > > > > > > 
> > > > > > > > > > > > 
> > > > > > > > > > > >    microchip,hw-device-address:
> > > > > > > > > > > >      $ref: /schemas/types.yaml#/definitions/uint32
> > > > > > > > > > > >      minimum: 0
> > > > > > > > > > > >      maximum: 3
> > > > > > > > > > > >      description:
> > > > > > > > > > > >        The address is set on a per-device basis by fuses in the factory,
> > > > > > > > > > > >        configured on request. If not requested, the fuses are set for 0x1.
> > > > > > > > > > > >        The device address is part of the device markings to avoid
> > > > > > > > > > > >        potential confusion. This address is coded on two bits, so four possible
> > > > > > > > > > > >        addresses are available when multiple devices are present on the same
> > > > > > > > > > > >        SPI bus with only one Chip Select line for all devices.
> > > > > > > > > > > >        Each device communication starts by a CS falling edge, followed by the
> > > > > > > > > > > >        clocking of the device address (BITS[7:6] - top two bits of COMMAND BYTE
> > > > > > > > > > > >        which is first one on the wire).
> > > > > > > > > > > > 
> > > > > > > > > > > > This sounds exactly like the sort of feature that you're dealing with
> > > > > > > > > > > > here?
> > > > > > > > > > > The core idea yes but for this chip, things are a bit more annoying (but
> > > > > > > > > > > Janani can correct me if I'm wrong). Here, each device can, in theory,
> > > > > > > > > > > have it's own supplies, pins and at the very least, channels with maybe
> > > > > > > > > > > different scales. That is why Janani is proposing dac nodes. Given I
> > > > > > > > > > > honestly don't like much of that "adi,ad5529r-bus" compatible I wondered
> > > > > > > > > > > about solving this at the spi level.
> > > > > > > > > > > 
> > > > > > > > > > > Ah and to make it more annoying, we can also mix 12 and 16 bits variants
> > > > > > > > > > > together in the same bus.
> > > > > > > > > > I'm definitely missing something, because that property for the
> > > > > > > > > > microchip devices is not impacted what else is on the bus. AFAICT, you
> > > > > > > > > > could have an mcp3911 and an mcp3564 on the same bus even though both
> > > > > > > > > > are completely different devices with different drivers. They have
> > > > > > > > > > individual device nodes and their own supplies etc etc. These aren't
> > > > > > > > > > per-channel properties on an adc or dac, they're per child device on a
> > > > > > > > > > spi bus.
> > > > > > > > > Maybe I'm the one missing something :). IIRC, spi would not allow two
> > > > > > > > > devices on the same CS right? Because for this chip we would need
> > > > > > > > > something like:
> > > > > > > > > 
> > > > > > > > > spi {
> > > > > > > > > 	dac@0 {
> > > > > > > > > 		reg = <0>;
> > > > > > > > > 		adi,pin-id = <0>;
> > > > > > > > > 	};
> > > > > > > > > 
> > > > > > > > > 	dac@1 {
> > > > > > > > > 		reg = <0>; // which seems already problematic?
> > > > > > > > > 		adi,pin-id <1>;
> > > > > > > > > 	};
> > > > > > > > > 
> > > > > > > > > 	...
> > > > > > > > > 
> > > > > > > > > 	//up to 4
> > > > > > > > > };
> > > > > > > > Yeah. It's not clear to me how that works for the microchip devices
> > > > > > > > (I suspect it doesn't!)
> > > > > > > > 
> > > > > > > > Just thinking as I type, but could we do something a bit nasty with
> > > > > > > > a gpio mux that doesn't actually switch but represents the GPIO being
> > > > > > > > shared?  Given this is all tied to the spi bus that should all happen
> > > > > > > > under serializing locks.
> > > > > > > > 
> > > > > > > > Agreed though that this would be nicer as an SPI thing that let
> > > > > > > > us specify that a single CS is share by multiple devices and their
> > > > > > > > is some other signal acting to select which one we are talking to.
> > > > > > > If the device-addressing on the same chip-select is to be handled
> > > > > > > by the spi framework, wouldn't we lose device-specific features?
> > > > > > > 
> > > > > > > I understand that this multi-device feature is there mostly to extend the
> > > > > > > channel count from 16 to 32, 48 or 64. I suppose the command:
> > > > > > > 
> > > > > > > 	"MULTI DEVICE SW LDAC MODE"
> > > > > > > 
> > > > > > > exists so that software can update channel values accross multiple devices.
> > > > > > Right! You do have a point! I agree the main driver for a feature like
> > > > > > this is likely to extend the channel count and effectively "aggregate"
> > > > > > devices.
> > > > > > 
> > > > > > But I would say that even with the spi solution the MULTI DEVICE stuff
> > > > > > should be doable (as we still need a sort of adi,pin-id property).
> > > > > I don't think we can have something like an IIO buffer shared by multiple
> > > > > devices. Synchronizing separate devices would be doable with proper hardware
> > > > > support for this (probably involving an FGPA).
> > > > True!
> > > > > > But yes, I do feel that the whole feature is for aggregation so seeing
> > > > > > one device with 32 channels is the expectation here? Rather than seeing
> > > > > > two devices with 16 channels.
> > > > > Yes, I think aggregation is the whole point there... so that the IIO driver
> > > > > is multi-device-aware.
> > > > Which makes me feel that different pins per device might be possible
> > > > from an HW point of view but does not make much sense. For example, for
> > > > the buffer example I would expect LDAC to be shared between all the
> > > > devices.
> > > That is why I would still suggest the multi-dac node in the middle...
> > > the parent node can hold shared resources, while the dac children can
> > > have their own, overriding or inheriting stuff.
> > > 
> > Before going down that path I'd want confirmation this is something we
> > actually think anyone will build.
> > 
> > Jonathan
> 
> To directly answer your question- we currently do not have a platform that supports multi device topology with independent supplies or reset lines.
> Given that, I agree to start with the parallel wiring assumption and defer per chip resource variation under there is a solid use case. I will also drop the "adi,resolution" proposal and proceed with "adi,device-addrs" in the AD5529R binding.
> With all of the above, the proposed binding for the multi-device follow up series would look like:
> 
> 
>     dac@0 {
>         compatible = "adi,ad5529r-16";
>         reg = <0>;
>         adi,device-addrs = <0 1>;

I think this should be put in the channel itself and made generic.

>         reset-gpios = <&gpio0 87 GPIO_ACTIVE_LOW>;
>         vdd-supply  = <&vdd_reg>;
>         hvdd-supply = <&hvdd_reg>;
> 
>         channel@0  { reg = <0>;  adi,output-range-microvolt = <0 5000000>; };
>         channel@16 { reg = <16>; adi,output-range-microvolt = <0 40000000>; };
>     };
> 
> Does this look reasonable to everyone?
> 
> Regards,
> Janani Sunil
> 

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply

* Re: [PATCH v3 3/7] net: wwan: t9xx: Add control DMA interface
From: Andrew Lunn @ 2026-06-24 16:15 UTC (permalink / raw)
  To: jackbb_wu
  Cc: Loic Poulain, Sergey Ryazanov, Johannes Berg, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Wen-Zhi Huang, Shi-Wei Yeh, Minano Tseng, Matthias Brugger,
	AngeloGioacchino Del Regno, Simon Horman, Jonathan Corbet,
	Shuah Khan, linux-kernel, netdev, linux-arm-kernel,
	linux-mediatek, linux-doc
In-Reply-To: <20260624-t9xx_driver_v1-v3-3-73ff03f60c48@compal.com>

> diff --git a/drivers/net/wwan/t9xx/pcie/mtk_cldma.c b/drivers/net/wwan/t9xx/pcie/mtk_cldma.c

> +static inline void mtk_cldma_clr_bd_dsc(struct cldma_drv_info *drv_info,
> +					struct bd_dsc *bd_dsc_pool, int nr_bds)

No inline functions in C files. Please let the compiler decide.

> +static int mtk_cldma_reload_rx_skb(struct mtk_md_dev *mdev, struct rxq *rxq,
> +				   struct rx_req *req)
> +{
> +	struct sk_buff *tail = NULL;
> +	struct bd_dsc *bd_dsc;
> +	int nr_bds;
> +	int i, ret;
> +
> +	nr_bds = rxq->nr_bds;
> +
> +	for (i = 0; i < nr_bds; i++) {
> +		bd_dsc = req->bd_dsc_pool + i;
> +		bd_dsc->skb = __dev_alloc_skb(req->frag_size, GFP_KERNEL);
> +		if (!bd_dsc->skb) {
> +			dev_warn((mdev)->dev, "Failed to alloc SKB\n");

You might want to rate limit this, and the other similar messages in
the data path, otherwise it could be a DOS.

> +u32 mtk_cldma_stop_queue(struct cldma_drv_info *drv_info, enum mtk_tx_rx dir, u32 qno)
> +{
> +	u32 val = (qno == ALLQ) ? qno : BIT(qno);
> +	struct cldma_hw_regs *hw_regs;
> +	unsigned int active;
> +	int cnt = 0;
> +	int base;
> +	u32 addr;
> +
> +	hw_regs = drv_info->hw_regs;
> +	base = drv_info->base_addr;
> +
> +	if (dir == DIR_TX)
> +		addr = base + hw_regs->reg_cldma_ul_stop_cmd;
> +	else
> +		addr = base + hw_regs->reg_cldma_so_stop_cmd;
> +
> +	mtk_pci_write32(drv_info->mdev, addr, val);
> +
> +	do {
> +		active = drv_info->drv_ops->cldma_queue_status(drv_info, dir, qno);
> +		if (active == LINK_ERROR_VAL || !active)
> +			break;
> +		usleep_range(WAIT_QUEUE_STOP, 2 * WAIT_QUEUE_STOP);
> +	} while (++cnt < 10);

Please use one of the helpers from iopoll.h. Any loops waiting for an
event to happen should use those macros, since the open code
implementation is often wrong.

> +static int mtk_ctrl_trb_srv_init(struct mtk_ctrl_trans *trans)
> +{
> +	struct srv_que *srv_que;
> +	struct trb_srv *srv;
> +	int i, j;
> +	int ret;
> +
> +	for (i = 0; i < trans->trb_srv_num; i++) {
> +		srv = devm_kzalloc(trans->mdev->dev, sizeof(*srv), GFP_KERNEL);
> +		if (!srv) {
> +			ret = -ENOMEM;
> +			goto err_free_srv;
> +		}
> +
> +		srv->trans = trans;
> +		srv->srv_id = i;
> +		trans->trb_srv[i] = srv;
> +
> +		init_waitqueue_head(&srv->trb_waitq);
> +		for (j = 0; j < NR_CLDMA; j++)
> +			INIT_LIST_HEAD(&srv->srv_q_list[j]);
> +	}
> +
> +	for (i = 0; i < NR_CLDMA; i++)
> +		for (j = 0; j < HW_QUE_NUM; j++) {
> +			if (trans->srv_cfg[i][j] < 0 ||
> +			    trans->srv_cfg[i][j] >= trans->trb_srv_num)
> +				trans->srv_cfg[i][j] = 0;
> +			srv_que = devm_kzalloc(trans->mdev->dev, sizeof(*srv_que), GFP_KERNEL);
> +			if (!srv_que) {
> +				ret = -ENOMEM;
> +				goto err_free_srv_que;
> +			}
> +			srv_que->hif_id = i;
> +			srv_que->qno = j;
> +			list_add_tail(&srv_que->list,
> +				      &trans->trb_srv[trans->srv_cfg[i][j]]->srv_q_list[i]);
> +		}
> +
> +	for (i = 0; i < trans->trb_srv_num; i++) {
> +		trans->trb_srv[i]->trb_thread = kthread_run(mtk_ctrl_trb_thread, trans->trb_srv[i],
> +							    "mtk_trb_srv%d_%s", i,
> +							    trans->mdev->dev_str);
> +		if (IS_ERR(trans->trb_srv[i]->trb_thread)) {
> +			ret = PTR_ERR(trans->trb_srv[i]->trb_thread);
> +			trans->trb_srv[i]->trb_thread = NULL;
> +			goto err_stop_kthread;
> +		}
> +	}
> +
> +	return 0;
> +err_stop_kthread:
> +	while (--i >= 0)
> +		kthread_stop(trans->trb_srv[i]->trb_thread);
> +err_free_srv_que:
> +	for (i = 0; i < trans->trb_srv_num; i++) {
> +		for (j = 0; j < NR_CLDMA; j++) {
> +			struct srv_que *next_srv_que;
> +
> +			list_for_each_entry_safe(srv_que, next_srv_que,
> +						 &trans->trb_srv[i]->srv_q_list[j], list) {
> +				list_del(&srv_que->list);
> +				devm_kfree(trans->mdev->dev, srv_que);

It is unusual to see devm_kfree(). Why is it needed?

> +static unsigned int ctrl_port_chl_mtu;

Is this a global variable? Why is it not part of priv?

> +module_param(ctrl_port_chl_mtu, uint, 0644);
> +MODULE_PARM_DESC(ctrl_port_chl_mtu, "This is used to config the ctrl port mtu!\n");

Ah. No modules parameters please. If this is an MTU, why not use the
normal networking interfaces to set the MTU?

	Andrew

^ permalink raw reply

* Re: [PATCH v3 2/7] net: wwan: t9xx: Add control plane transaction layer
From: Andrew Lunn @ 2026-06-24 16:00 UTC (permalink / raw)
  To: jackbb_wu
  Cc: Loic Poulain, Sergey Ryazanov, Johannes Berg, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Wen-Zhi Huang, Shi-Wei Yeh, Minano Tseng, Matthias Brugger,
	AngeloGioacchino Del Regno, Simon Horman, Jonathan Corbet,
	Shuah Khan, linux-kernel, netdev, linux-arm-kernel,
	linux-mediatek, linux-doc
In-Reply-To: <20260624-t9xx_driver_v1-v3-2-73ff03f60c48@compal.com>

> +static int __init mtk_common_drv_init(void)
> +{
> +	return 0;
> +}
> +module_init(mtk_common_drv_init);
> +
> +static void __exit mtk_common_drv_exit(void)
> +{
> +}
> +module_exit(mtk_common_drv_exit);

Since these don't do anything, they should not be needed.

> @@ -467,6 +468,7 @@ static u32 mtk_pci_ext_h2d_evt_hw_bits(u32 chs)
>  
>  	SET_HW_BITS(hw_bits, chs, MHCCIF_RC2EP_EVT_DEVICE_RESET,
>  		    DEV_EVT_H2D_DEVICE_RESET);
> +
>  	return LE32_TO_U32(cpu_to_le32(hw_bits));

Please don't add white space like this. I assume a previous patch
added this code, so move this to that patch.

> @@ -908,13 +910,11 @@ static int mtk_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  	struct mtk_md_dev *mdev;
>  	int ret;
>  
> -	mdev = devm_kzalloc(dev, sizeof(*mdev), GFP_KERNEL);
> +	mdev = mtk_dev_alloc(dev, &pci_hw_ops);
>  	if (!mdev) {
>  		ret = -ENOMEM;
>  		goto log_err;
>  	}
> -	mdev->dev_ops = &pci_hw_ops;
> -	mdev->dev = dev;
>  
>  	priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
>  	if (!priv) {
> @@ -991,7 +991,7 @@ static int mtk_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  free_priv_data:
>  	devm_kfree(dev, priv);
>  free_cntx_data:
> -	devm_kfree(dev, mdev);
> +	mtk_dev_free(mdev);

Why are you removing devm_ calls?

	Andrew

^ permalink raw reply

* Re: [PATCH v3 1/7] net: wwan: t9xx: Add PCIe core
From: Andrew Lunn @ 2026-06-24 15:56 UTC (permalink / raw)
  To: jackbb_wu
  Cc: Loic Poulain, Sergey Ryazanov, Johannes Berg, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Wen-Zhi Huang, Shi-Wei Yeh, Minano Tseng, Matthias Brugger,
	AngeloGioacchino Del Regno, Simon Horman, Jonathan Corbet,
	Shuah Khan, linux-kernel, netdev, linux-arm-kernel,
	linux-mediatek, linux-doc
In-Reply-To: <20260624-t9xx_driver_v1-v3-1-73ff03f60c48@compal.com>


> From: Jack Wu <jackbb_wu@compal.com>
> 
> Registers the T900 device driver with the kernel. Set up all
> the fundamental configurations for the device: PCIe layer,
> Modem Host Cross Core Interface (MHCCIF), Reset Generation
> Unit (RGU), modem common control operations and build
> infrastructure.
> 
> * PCIe layer code implements driver probe and removal, MSI-X
>   interrupt initialization and de-initialization, and the way
>   of resetting the device.
> * MHCCIF provides interrupt channels to communicate events
>   such as handshake, PM and port enumeration.
> * RGU provides interrupt channels to generate notifications
>   from the device so that the T900 driver could get the
>   device reset.
> * Modem common control operations provide the basic read/write
>   functions of the device's hardware registers,
>   mask/unmask/get/clear functions of the device's interrupt
>   registers and inquiry functions of the device's status.
> 
> Signed-off-by: Jack Wu <jackbb_wu@compal.com>
> ---
>  drivers/net/wwan/Kconfig                      |   12 +
>  drivers/net/wwan/Makefile                     |    1 +
>  drivers/net/wwan/t9xx/Makefile                |   10 +
>  drivers/net/wwan/t9xx/mtk_dev.h               |  108 +++
>  drivers/net/wwan/t9xx/pcie/mtk_pci.c          | 1049 +++++++++++++++++++++++++
>  drivers/net/wwan/t9xx/pcie/mtk_pci.h          |  234 ++++++
>  drivers/net/wwan/t9xx/pcie/mtk_pci_drv_m9xx.c |   69 ++
>  drivers/net/wwan/t9xx/pcie/mtk_pci_reg.h      |   70 ++
>  8 files changed, 1553 insertions(+)
> 
> diff --git a/drivers/net/wwan/Kconfig b/drivers/net/wwan/Kconfig
> index 88df55d78d90..4cee537c739f 100644
> --- a/drivers/net/wwan/Kconfig
> +++ b/drivers/net/wwan/Kconfig
> @@ -121,6 +121,18 @@ config MTK_T7XX
>  
>  	  If unsure, say N.
>  
> +config MTK_T9XX
> +	tristate "MediaTek PCIe 5G WWAN modem T9xx device"
> +	depends on PCI
> +	select NET_DEVLINK
> +	help
> +	  Enables MediaTek PCIe based 5G WWAN modem (T9xx series) device.
> +
> +	  To compile this driver as a module, choose M here: the module will be
> +	  called mtk_t9xx.
> +
> +	  If unsure, say N.
> +
>  endif # WWAN
>  
>  endmenu
> diff --git a/drivers/net/wwan/Makefile b/drivers/net/wwan/Makefile
> index 3960c0ae2445..7361eef4c472 100644
> --- a/drivers/net/wwan/Makefile
> +++ b/drivers/net/wwan/Makefile
> @@ -14,3 +14,4 @@ obj-$(CONFIG_QCOM_BAM_DMUX) += qcom_bam_dmux.o
>  obj-$(CONFIG_RPMSG_WWAN_CTRL) += rpmsg_wwan_ctrl.o
>  obj-$(CONFIG_IOSM) += iosm/
>  obj-$(CONFIG_MTK_T7XX) += t7xx/
> +obj-$(CONFIG_MTK_T9XX) += t9xx/
> diff --git a/drivers/net/wwan/t9xx/Makefile b/drivers/net/wwan/t9xx/Makefile
> new file mode 100644
> index 000000000000..6f2dd3f91454
> --- /dev/null
> +++ b/drivers/net/wwan/t9xx/Makefile
> @@ -0,0 +1,10 @@
> +# SPDX-License-Identifier: GPL-2.0-only
> +
> +ccflags-y += -I$(src)/pcie
> +ccflags-y += -I$(src)
> +
> +obj-$(CONFIG_MTK_T9XX) += mtk_t9xx.o
> +
> +mtk_t9xx-y := \
> +	pcie/mtk_pci.o \
> +	pcie/mtk_pci_drv_m9xx.o
> diff --git a/drivers/net/wwan/t9xx/mtk_dev.h b/drivers/net/wwan/t9xx/mtk_dev.h
> new file mode 100644
> index 000000000000..8278a0e2875e
> --- /dev/null
> +++ b/drivers/net/wwan/t9xx/mtk_dev.h
> @@ -0,0 +1,108 @@
> +/* SPDX-License-Identifier: GPL-2.0-only
> + *
> + * Copyright (c) 2022, MediaTek Inc.
> + */
> +
> +#ifndef __MTK_DEV_H__
> +#define __MTK_DEV_H__
> +
> +#include <linux/dma-mapping.h>
> +#include <linux/dmapool.h>
> +#include <linux/kernel.h>
> +#include <linux/module.h>
> +#include <linux/sched.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock.h>
> +
> +#define MTK_DEV_STR_LEN 16
> +
> +enum mtk_user_id {
> +	MTK_USER_MIN,
> +	MTK_USER_CTRL,
> +	MTK_USER_DATA,
> +	MTK_USER_MAX
> +};
> +
> +enum mtk_dev_evt_h2d {
> +	DEV_EVT_H2D_DEVICE_RESET	= BIT(2),
> +	DEV_EVT_H2D_MAX			= BIT(5)
> +};
> +
> +enum mtk_dev_evt_d2h {
> +	DEV_EVT_D2H_BOOT_FLOW_SYNC	= BIT(4),
> +	DEV_EVT_D2H_ASYNC_HS_NOTIFY_SAP = BIT(5),
> +	DEV_EVT_D2H_ASYNC_HS_NOTIFY_MD	= BIT(6),
> +	DEV_EVT_D2H_MAX			= BIT(11)
> +};
> +
> +struct mtk_md_dev;
> +
> +struct mtk_dev_ops {
> +	u32 (*get_dev_state)(struct mtk_md_dev *mdev);
> +	void (*ack_dev_state)(struct mtk_md_dev *mdev, u32 state);
> +	u32 (*get_dev_cfg)(struct mtk_md_dev *mdev);
> +	int (*register_dev_evt)(struct mtk_md_dev *mdev, u32 dev_evt,
> +				int (*evt_cb)(u32 status, void *data), void *data);
> +	void (*unregister_dev_evt)(struct mtk_md_dev *mdev, u32 dev_evt);
> +	void (*mask_dev_evt)(struct mtk_md_dev *mdev, u32 dev_evt);
> +	void (*unmask_dev_evt)(struct mtk_md_dev *mdev, u32 dev_evt);
> +	void (*clear_dev_evt)(struct mtk_md_dev *mdev, u32 dev_evt);
> +	int (*send_dev_evt)(struct mtk_md_dev *mdev, u32 dev_evt);
> +};
> +
> +/* mtk_md_dev defines the structure of MTK modem device */
> +struct mtk_md_dev {
> +	struct device *dev;
> +	const struct mtk_dev_ops *dev_ops;
> +	void *hw_priv;
> +	u32 hw_ver;
> +	char dev_str[MTK_DEV_STR_LEN];
> +};
> +
> +static inline u32 mtk_dev_get_dev_state(struct mtk_md_dev *mdev)
> +{
> +	return mdev->dev_ops->get_dev_state(mdev);
> +}
> +
> +static inline void mtk_dev_ack_dev_state(struct mtk_md_dev *mdev, u32 state)
> +{
> +	return mdev->dev_ops->ack_dev_state(mdev, state);
> +}
> +
> +static inline u32 mtk_dev_get_dev_cfg(struct mtk_md_dev *mdev)
> +{
> +	return mdev->dev_ops->get_dev_cfg(mdev);
> +}
> +
> +static inline int mtk_dev_register_dev_evt(struct mtk_md_dev *mdev, u32 dev_evt,
> +					   int (*evt_cb)(u32 status, void *data), void *data)
> +{
> +	return mdev->dev_ops->register_dev_evt(mdev, dev_evt, evt_cb, data);
> +}
> +
> +static inline void mtk_dev_unregister_dev_evt(struct mtk_md_dev *mdev, u32 dev_evt)
> +{
> +	mdev->dev_ops->unregister_dev_evt(mdev, dev_evt);
> +}
> +
> +static inline void mtk_dev_mask_dev_evt(struct mtk_md_dev *mdev, u32 dev_evt)
> +{
> +	mdev->dev_ops->mask_dev_evt(mdev, dev_evt);
> +}
> +
> +static inline void mtk_dev_unmask_dev_evt(struct mtk_md_dev *mdev, u32 dev_evt)
> +{
> +	mdev->dev_ops->unmask_dev_evt(mdev, dev_evt);
> +}
> +
> +static inline void mtk_dev_clear_dev_evt(struct mtk_md_dev *mdev, u32 dev_evt)
> +{
> +	mdev->dev_ops->clear_dev_evt(mdev, dev_evt);
> +}
> +
> +static inline int mtk_dev_send_dev_evt(struct mtk_md_dev *mdev, u32 dev_evt)
> +{
> +	return mdev->dev_ops->send_dev_evt(mdev, dev_evt);
> +}
> +
> +#endif /* __MTK_DEV_H__ */
> diff --git a/drivers/net/wwan/t9xx/pcie/mtk_pci.c b/drivers/net/wwan/t9xx/pcie/mtk_pci.c
> new file mode 100644
> index 000000000000..c6a7196fcdd6
> --- /dev/null
> +++ b/drivers/net/wwan/t9xx/pcie/mtk_pci.c
> @@ -0,0 +1,1049 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (c) 2022, MediaTek Inc.
> + */
> +
> +#include <linux/acpi.h>
> +#include <linux/aer.h>
> +#include <linux/bitfield.h>
> +#include <linux/debugfs.h>
> +#include <linux/delay.h>
> +#include <linux/device.h>
> +#include <linux/dma-mapping.h>
> +#include <linux/kernel.h>
> +#include <linux/module.h>
> +
> +#include "mtk_dev.h"
> +#include "mtk_pci.h"
> +#include "mtk_pci_reg.h"
> +
> +#define MTK_PCI_BAR_NUM		6
> +#define MTK_PCI_TRANSPARENT_ATR_SIZE	(0x3F)
> +#define MTK_PCI_MINIMUM_ATR_SIZE	(0x1000)
> +#define ATR_SIZE_LO32_MASK		GENMASK_ULL(31, 0)
> +#define ATR_SIZE_HI32_MASK		GENMASK_ULL(63, 32)
> +#define ATR_SIZE_BIAS_FROM_LO32		2
> +#define ATR_ADDR_ALIGN_MASK		0xFFFFF000
> +#define ATR_EN				BIT(0)
> +#define ATR_PARAM_OFFSET		16
> +/* Delay between ACPI PXP._OFF and _ON for modem power cycle stabilization */
> +#define MTK_PLDR_POWER_OFF_DELAY_MS	500
> +#define LE32_TO_U32(x) ((__force u32)(__le32)(x))
> +#define SET_HW_BITS(dest, chs, mhccif, dev)		\
> +	({						\
> +		if ((chs) & (dev))					\
> +			(dest) |= FIELD_PREP(mhccif, 1);		\
> +	})
> +
> +struct mtk_mhccif_cb {
> +	struct list_head entry;
> +	int (*evt_cb)(u32 status, void *data);
> +	void *data;
> +	u32 chs;
> +};
> +
> +/**
> + * mtk_pci_setup_atr() - Configure a PCIe address translation rule
> + * @mdev: MTK MD device
> + * @cfg: ATR configuration parameters
> + *
> + * Return: 0 on success, negative error code on failure.
> + */
> +int mtk_pci_setup_atr(struct mtk_md_dev *mdev, struct mtk_atr_cfg *cfg)
> +{
> +	struct mtk_pci_priv *priv = mdev->hw_priv;
> +	u32 addr, val, size_h, size_l;
> +	int atr_size, pos, offset;
> +
> +	if (cfg->transparent) {
> +		/* No address conversion is performed */
> +		atr_size = MTK_PCI_TRANSPARENT_ATR_SIZE;
> +	} else {
> +		if (cfg->size < MTK_PCI_MINIMUM_ATR_SIZE)
> +			cfg->size = MTK_PCI_MINIMUM_ATR_SIZE;
> +
> +		if (cfg->src_addr & (cfg->size - 1)) {
> +			dev_err((mdev)->dev, "Invalid atr src addr is not aligned to size\n");
> +			return -EFAULT;
> +		}
> +
> +		if (cfg->trsl_addr & (cfg->size - 1)) {
> +			dev_err((mdev)->dev,
> +				"Invalid atr trsl addr is not aligned to size, %llx, %llx\n",
> +				cfg->trsl_addr, cfg->size - 1);
> +			return -EFAULT;
> +		}
> +
> +		size_l = FIELD_GET(ATR_SIZE_LO32_MASK, cfg->size);
> +		size_h = FIELD_GET(ATR_SIZE_HI32_MASK, cfg->size);
> +		pos = ffs(size_l);
> +		if (pos) {
> +			atr_size = pos - ATR_SIZE_BIAS_FROM_LO32;
> +		} else {
> +			pos = ffs(size_h);
> +			atr_size = pos + 32 - ATR_SIZE_BIAS_FROM_LO32;
> +		}
> +	}
> +
> +	/* Calculate table offset */
> +	offset = ATR_PORT_OFFSET * cfg->port + ATR_TABLE_OFFSET * cfg->table;
> +	addr = REG_ATR_PCIE_WIN0_T0_SRC_ADDR_MSB + offset;
> +	val = (u32)(cfg->src_addr >> 32);
> +	mtk_pci_mac_write32(priv, addr, val);
> +
> +	addr = REG_ATR_PCIE_WIN0_T0_SRC_ADDR_LSB + offset;
> +	val = (u32)(cfg->src_addr & ATR_ADDR_ALIGN_MASK) | (atr_size << 1) | ATR_EN;
> +	mtk_pci_mac_write32(priv, addr, val);
> +
> +	addr = REG_ATR_PCIE_WIN0_T0_TRSL_ADDR_MSB + offset;
> +	val = (u32)(cfg->trsl_addr >> 32);
> +	mtk_pci_mac_write32(priv, addr, val);
> +
> +	addr = REG_ATR_PCIE_WIN0_T0_TRSL_ADDR_LSB + offset;
> +	val = (u32)(cfg->trsl_addr & ATR_ADDR_ALIGN_MASK);
> +	mtk_pci_mac_write32(priv, addr, val);
> +
> +	/* TRSL_PARAM */
> +	addr = REG_ATR_PCIE_WIN0_T0_TRSL_PARAM + offset;
> +	val = (cfg->trsl_param << ATR_PARAM_OFFSET) | cfg->trsl_id;
> +	mtk_pci_mac_write32(priv, addr, val);
> +
> +	return 0;
> +}
> +
> +/**
> + * mtk_pci_atr_disable() - Disable all PCIe address translation rules
> + * @priv: MTK PCI private data
> + */
> +void mtk_pci_atr_disable(struct mtk_pci_priv *priv)
> +{
> +	int port, tbl, offset;
> +	u32 val;
> +
> +	/* Disable all ATR table for all ports */
> +	for (port = ATR_SRC_PCI_WIN0; port <= ATR_SRC_AXIS_3; port++)
> +		for (tbl = 0; tbl < ATR_TABLE_NUM_PER_ATR; tbl++) {
> +			/* Calculate table offset */
> +			offset = ATR_PORT_OFFSET * port + ATR_TABLE_OFFSET * tbl;
> +			val = mtk_pci_mac_read32(priv, REG_ATR_PCIE_WIN0_T0_SRC_ADDR_LSB + offset);
> +			val = val & (~BIT(0));
> +			/* Disable table by SRC_ADDR_L */
> +			mtk_pci_mac_write32(priv, REG_ATR_PCIE_WIN0_T0_SRC_ADDR_LSB + offset, val);
> +		}
> +}
> +
> +static void mtk_pci_set_msix_merged(struct mtk_pci_priv *priv, int irq_cnt)
> +{
> +	mtk_pci_mac_write32(priv, REG_PCIE_CFG_MSIX, ffs(irq_cnt) * 2 - 1);
> +}
> +
> +/**
> + * mtk_pci_get_dev_state() - Read the device state from the modem
> + * @mdev: MTK MD device
> + *
> + * Return: Device state value.
> + */
> +u32 mtk_pci_get_dev_state(struct mtk_md_dev *mdev)
> +{
> +	return mtk_pci_mac_read32(mdev->hw_priv, REG_PCIE_DEBUG_DUMMY_7);
> +}
> +
> +/**
> + * mtk_pci_ack_dev_state() - Acknowledge the device state to the modem
> + * @mdev: MTK MD device
> + * @state: State value to acknowledge
> + */
> +void mtk_pci_ack_dev_state(struct mtk_md_dev *mdev, u32 state)
> +{
> +	mtk_pci_mac_write32(mdev->hw_priv, REG_PCIE_DEBUG_DUMMY_7, state);
> +}
> +
> +/**
> + * mtk_pci_get_irq_id() - Map an IRQ source to its hardware IRQ ID
> + * @mdev: MTK MD device
> + * @irq_src: IRQ source enum
> + *
> + * Return: IRQ ID on success, -EINVAL on failure.
> + */
> +int mtk_pci_get_irq_id(struct mtk_md_dev *mdev, enum mtk_irq_src irq_src)
> +{
> +	struct mtk_pci_priv *priv = mdev->hw_priv;
> +	const int *irq_tbl = priv->cfg->irq_tbl;
> +	int irq_id = -EINVAL;
> +
> +	if (irq_src > MTK_IRQ_SRC_MIN && irq_src < MTK_IRQ_SRC_MAX) {
> +		irq_id = irq_tbl[irq_src];
> +		if (irq_id < 0 || irq_id >= MTK_IRQ_CNT_MAX)
> +			irq_id = -EINVAL;
> +	}
> +
> +	return irq_id;
> +}
> +
> +/**
> + * mtk_pci_get_virq_id() - Get the Linux virtual IRQ for a hardware IRQ ID
> + * @mdev: MTK MD device
> + * @irq_id: Hardware IRQ ID
> + *
> + * Return: Virtual IRQ number on success, negative error code on failure.
> + */
> +int mtk_pci_get_virq_id(struct mtk_md_dev *mdev, int irq_id)
> +{
> +	struct pci_dev *pdev = to_pci_dev(mdev->dev);
> +	struct mtk_pci_priv *priv = mdev->hw_priv;
> +
> +	if (!priv->irq_cnt || irq_id < 0)
> +		return -EINVAL;
> +
> +	return pci_irq_vector(pdev, irq_id % priv->irq_cnt);
> +}
> +
> +/**
> + * mtk_pci_register_irq() - Register a callback for a hardware IRQ
> + * @mdev: MTK MD device
> + * @irq_id: Hardware IRQ ID
> + * @irq_cb: Callback function
> + * @data: Private data passed to callback
> + *
> + * Return: 0 on success, negative error code on failure.
> + */
> +int mtk_pci_register_irq(struct mtk_md_dev *mdev, int irq_id,
> +			 int (*irq_cb)(int irq_id, void *data), void *data)
> +{
> +	struct mtk_pci_priv *priv = mdev->hw_priv;
> +
> +	if ((irq_id < 0 || irq_id >= MTK_IRQ_CNT_MAX) || !irq_cb)
> +		return -EINVAL;
> +
> +	if (priv->irq_cb_list[irq_id]) {
> +		dev_err((mdev)->dev,
> +			"Unable to register irq, irq_id=%d, it's already been register by %ps.\n",
> +			irq_id, priv->irq_cb_list[irq_id]);
> +		return -EFAULT;
> +	}
> +	priv->irq_cb_list[irq_id] = irq_cb;
> +	priv->irq_cb_data[irq_id] = data;
> +
> +	return 0;
> +}
> +
> +/**
> + * mtk_pci_unregister_irq() - Unregister a hardware IRQ callback
> + * @mdev: MTK MD device
> + * @irq_id: Hardware IRQ ID
> + *
> + * Return: 0 on success, negative error code on failure.
> + */
> +int mtk_pci_unregister_irq(struct mtk_md_dev *mdev, int irq_id)
> +{
> +	struct mtk_pci_priv *priv = mdev->hw_priv;
> +
> +	if (irq_id < 0 || irq_id >= MTK_IRQ_CNT_MAX)
> +		return -EINVAL;
> +
> +	if (!priv->irq_cb_list[irq_id]) {
> +		dev_err((mdev)->dev, "irq_id=%d has not been registered\n", irq_id);
> +		return -EFAULT;
> +	}
> +	priv->irq_cb_list[irq_id] = NULL;
> +	priv->irq_cb_data[irq_id] = NULL;
> +
> +	return 0;
> +}
> +
> +/**
> + * mtk_pci_mask_irq() - Mask (disable) a hardware IRQ
> + * @mdev: MTK MD device
> + * @irq_id: Hardware IRQ ID
> + *
> + * Return: 0 on success, negative error code on failure.
> + */
> +int mtk_pci_mask_irq(struct mtk_md_dev *mdev, int irq_id)
> +{
> +	struct mtk_pci_priv *priv = mdev->hw_priv;
> +
> +	if (irq_id < 0 || irq_id >= MTK_IRQ_CNT_MAX ||
> +	    priv->irq_type != PCI_IRQ_MSIX) {
> +		dev_err(mdev->dev, "Failed to mask irq: input irq_id=%d\n", irq_id);
> +		return -EINVAL;
> +	}
> +
> +	mtk_pci_mac_write32(priv, REG_IMASK_HOST_MSIX_CLR_GRP0_0, BIT(irq_id));
> +
> +	return 0;
> +}
> +
> +/**
> + * mtk_pci_unmask_irq() - Unmask (enable) a hardware IRQ
> + * @mdev: MTK MD device
> + * @irq_id: Hardware IRQ ID
> + *
> + * Return: 0 on success, negative error code on failure.
> + */
> +int mtk_pci_unmask_irq(struct mtk_md_dev *mdev, int irq_id)
> +{
> +	struct mtk_pci_priv *priv = mdev->hw_priv;
> +
> +	if (irq_id < 0 || irq_id >= MTK_IRQ_CNT_MAX ||
> +	    priv->irq_type != PCI_IRQ_MSIX) {
> +		dev_err(mdev->dev, "Failed to unmask irq: input irq_id=%d\n", irq_id);
> +		return -EINVAL;
> +	}
> +
> +	mtk_pci_mac_write32(priv, REG_IMASK_HOST_MSIX_SET_GRP0_0, BIT(irq_id));
> +
> +	return 0;
> +}
> +
> +/**
> + * mtk_pci_clear_irq() - Clear (acknowledge) a hardware IRQ
> + * @mdev: MTK MD device
> + * @irq_id: Hardware IRQ ID
> + *
> + * Return: 0 on success, negative error code on failure.
> + */
> +int mtk_pci_clear_irq(struct mtk_md_dev *mdev, int irq_id)
> +{
> +	struct mtk_pci_priv *priv = mdev->hw_priv;
> +
> +	if (irq_id < 0 || irq_id >= MTK_IRQ_CNT_MAX ||
> +	    priv->irq_type != PCI_IRQ_MSIX) {
> +		dev_err(mdev->dev, "Failed to clear irq: input irq_id=%d\n", irq_id);
> +		return -EINVAL;
> +	}
> +
> +	mtk_pci_mac_write32(priv, REG_MSIX_ISTATUS_HOST_GRP0_0, BIT(irq_id));
> +
> +	return 0;
> +}
> +
> +static u32 mtk_pci_ext_d2h_evt_hw_bits(u32 chs)
> +{
> +	u32 hw_bits = 0;
> +
> +	SET_HW_BITS(hw_bits, chs, MHCCIF_EP2RC_EVT_BOOT_FLOW_SYNC,
> +		    DEV_EVT_D2H_BOOT_FLOW_SYNC);
> +	SET_HW_BITS(hw_bits, chs, MHCCIF_EP2RC_EVT_ASYNC_HS_NOTIFY_SAP,
> +		    DEV_EVT_D2H_ASYNC_HS_NOTIFY_SAP);
> +	SET_HW_BITS(hw_bits, chs, MHCCIF_EP2RC_EVT_ASYNC_HS_NOTIFY_MD,
> +		    DEV_EVT_D2H_ASYNC_HS_NOTIFY_MD);
> +
> +	return LE32_TO_U32(cpu_to_le32(hw_bits));
> +}
> +
> +static u32 mtk_pci_ext_d2h_evt_chs(u32 hw_bits)
> +{
> +	u32 chs = 0;
> +
> +	if (!hw_bits)
> +		return chs;
> +
> +	chs = FIELD_PREP(DEV_EVT_D2H_BOOT_FLOW_SYNC,
> +			 FIELD_GET(MHCCIF_EP2RC_EVT_BOOT_FLOW_SYNC, hw_bits)) |
> +	      FIELD_PREP(DEV_EVT_D2H_ASYNC_HS_NOTIFY_SAP,
> +			 FIELD_GET(MHCCIF_EP2RC_EVT_ASYNC_HS_NOTIFY_SAP, hw_bits)) |
> +	      FIELD_PREP(DEV_EVT_D2H_ASYNC_HS_NOTIFY_MD,
> +			 FIELD_GET(MHCCIF_EP2RC_EVT_ASYNC_HS_NOTIFY_MD, hw_bits));
> +
> +	return chs;
> +}
> +
> +/**
> + * mtk_pci_register_ext_evt() - Register a callback for MHCCIF device events
> + * @mdev: MTK MD device
> + * @chs: Bitmask of event channels to register
> + * @evt_cb: Callback function
> + * @data: Private data passed to callback
> + *
> + * Return: 0 on success, negative error code on failure.
> + */
> +int mtk_pci_register_ext_evt(struct mtk_md_dev *mdev, u32 chs,
> +			     int (*evt_cb)(u32 status, void *data), void *data)
> +{
> +	struct mtk_pci_priv *priv = mdev->hw_priv;
> +	struct mtk_mhccif_cb *cb;
> +	int ret = 0;
> +
> +	if (!chs || !evt_cb)
> +		return -EINVAL;
> +
> +	spin_lock_bh(&priv->mhccif_lock);
> +	list_for_each_entry(cb, &priv->mhccif_cb_list, entry) {
> +		if (cb->chs & chs) {
> +			ret = -EFAULT;
> +			dev_err((mdev)->dev,
> +				"Unable to register evt, intersection: chs=0x%08x&0x%08x cb=%ps\n",
> +				chs, cb->chs, cb->evt_cb);
> +			goto err_spin_unlock;
> +		}
> +	}
> +	cb = devm_kzalloc(mdev->dev, sizeof(*cb), GFP_ATOMIC);
> +	if (!cb) {
> +		ret = -ENOMEM;
> +		goto err_spin_unlock;
> +	}
> +	cb->evt_cb = evt_cb;
> +	cb->data = data;
> +	cb->chs = chs;
> +	list_add_tail(&cb->entry, &priv->mhccif_cb_list);
> +err_spin_unlock:
> +	spin_unlock_bh(&priv->mhccif_lock);
> +
> +	return ret;
> +}
> +
> +/**
> + * mtk_pci_unregister_ext_evt() - Unregister an MHCCIF device event callback
> + * @mdev: MTK MD device
> + * @chs: Bitmask of event channels to unregister
> + */
> +void mtk_pci_unregister_ext_evt(struct mtk_md_dev *mdev, u32 chs)
> +{
> +	struct mtk_pci_priv *priv = mdev->hw_priv;
> +	struct mtk_mhccif_cb *cb, *next;
> +
> +	if (!chs)
> +		return;
> +
> +	spin_lock_bh(&priv->mhccif_lock);
> +	list_for_each_entry_safe(cb, next, &priv->mhccif_cb_list, entry) {
> +		if (cb->chs == chs) {
> +			list_del(&cb->entry);
> +			devm_kfree(mdev->dev, cb);
> +			goto out;
> +		}
> +	}
> +	dev_warn((mdev)->dev,
> +		 "Unable to unregister evt, no chs=0x%08x has been registered.\n", chs);
> +out:
> +	spin_unlock_bh(&priv->mhccif_lock);
> +}
> +
> +/**
> + * mtk_pci_mask_ext_evt() - Mask (disable) MHCCIF device events
> + * @mdev: MTK MD device
> + * @chs: Bitmask of event channels to mask
> + */
> +void mtk_pci_mask_ext_evt(struct mtk_md_dev *mdev, u32 chs)
> +{
> +	struct mtk_pci_priv *priv = mdev->hw_priv;
> +	u32 hw_bits = mtk_pci_ext_d2h_evt_hw_bits(chs);
> +
> +	mtk_pci_write32(mdev, priv->cfg->mhccif_rc_base_addr +
> +			MHCCIF_EP2RC_SW_INT_EAP_MASK_SET, hw_bits);
> +}
> +
> +/**
> + * mtk_pci_unmask_ext_evt() - Unmask (enable) MHCCIF device events
> + * @mdev: MTK MD device
> + * @chs: Bitmask of event channels to unmask
> + */
> +void mtk_pci_unmask_ext_evt(struct mtk_md_dev *mdev, u32 chs)
> +{
> +	struct mtk_pci_priv *priv = mdev->hw_priv;
> +	u32 hw_bits = mtk_pci_ext_d2h_evt_hw_bits(chs);
> +
> +	mtk_pci_write32(mdev, priv->cfg->mhccif_rc_base_addr +
> +			MHCCIF_EP2RC_SW_INT_EAP_MASK_CLR, hw_bits);
> +}
> +
> +/**
> + * mtk_pci_clear_ext_evt() - Clear (acknowledge) MHCCIF device events
> + * @mdev: MTK MD device
> + * @chs: Bitmask of event channels to clear
> + */
> +void mtk_pci_clear_ext_evt(struct mtk_md_dev *mdev, u32 chs)
> +{
> +	struct mtk_pci_priv *priv = mdev->hw_priv;
> +	u32 hw_bits = mtk_pci_ext_d2h_evt_hw_bits(chs);
> +
> +	mtk_pci_write32(mdev, priv->cfg->mhccif_rc_base_addr +
> +			MHCCIF_EP2RC_SW_INT_ACK, hw_bits);
> +}
> +
> +static u32 mtk_pci_ext_h2d_evt_hw_bits(u32 chs)
> +{
> +	u32 hw_bits = 0;
> +
> +	SET_HW_BITS(hw_bits, chs, MHCCIF_RC2EP_EVT_DEVICE_RESET,
> +		    DEV_EVT_H2D_DEVICE_RESET);
> +	return LE32_TO_U32(cpu_to_le32(hw_bits));
> +}
> +
> +/**
> + * mtk_pci_send_ext_evt() - Send an MHCCIF event to the modem
> + * @mdev: MTK MD device
> + * @ch: Event channel to trigger (must be a single bit)
> + *
> + * Return: 0 on success, negative error code on failure.
> + */
> +int mtk_pci_send_ext_evt(struct mtk_md_dev *mdev, u32 ch)
> +{
> +	struct mtk_pci_priv *priv = mdev->hw_priv;
> +	u32 rc_base, hw_bits;
> +
> +	rc_base = priv->cfg->mhccif_rc_base_addr;
> +
> +	/* Only allow one ch to be triggered at a time */
> +	if (!is_power_of_2(ch)) {
> +		dev_err((mdev)->dev, "Unsupported ext evt ch=0x%08x\n", ch);
> +		return -EINVAL;
> +	}
> +
> +	hw_bits = mtk_pci_ext_h2d_evt_hw_bits(ch);
> +	mtk_pci_write32(mdev, rc_base + MHCCIF_RC2EP_SW_BSY, hw_bits);
> +	mtk_pci_write32(mdev, rc_base + MHCCIF_RC2EP_SW_TCHNUM, ffs(hw_bits) - 1);
> +	return 0;
> +}
> +
> +static u32 mtk_pci_get_ext_evt_hw_status(struct mtk_md_dev *mdev)
> +{
> +	struct mtk_pci_priv *priv = mdev->hw_priv;
> +
> +	return mtk_pci_read32(mdev, priv->cfg->mhccif_rc_base_addr +
> +			      MHCCIF_EP2RC_SW_INT_STS);
> +}
> +
> +/**
> + * mtk_pci_fldr() - Perform a Function Level Device Reset via ACPI _RST
> + * @mdev: MTK MD device
> + *
> + * Return: 0 on success, negative error code on failure.
> + */
> +int mtk_pci_fldr(struct mtk_md_dev *mdev)
> +{
> +#ifdef CONFIG_ACPI

...

> +#else /* !CONFIG_ACPI */
> +	dev_err((mdev)->dev, "Unsupported, CONFIG ACPI hasn't been set to 'y'\n");

Why not just have the Kconfig depend on ACPI?

> +	if (ret) {
> +		dev_err((mdev)->dev, "Failed to register mhccif_irq callback\n");

Why the () around mdev?


    Andrew

---
pw-bot: cr

^ permalink raw reply

* Re: [PATCH v6 6/8] Documentation: bootconfig: document build-time cmdline rendering
From: Breno Leitao @ 2026-06-24 15:50 UTC (permalink / raw)
  To: Masami Hiramatsu
  Cc: Andrew Morton, Nathan Chancellor, paulmck, Nicolas Schier,
	Nick Desaulniers, Bill Wendling, Justin Stitt, Jonathan Corbet,
	Shuah Khan, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
	Dave Hansen, x86, H. Peter Anvin, linux-kernel,
	linux-trace-kernel, linux-kbuild, bpf, llvm, linux-doc,
	kernel-team
In-Reply-To: <20260624174737.a4862dcd86f3d746b788d197@kernel.org>

On Wed, Jun 24, 2026 at 05:47:37PM +0900, Masami Hiramatsu wrote:
> On Tue, 23 Jun 2026 09:15:33 -0700
> >
> > +The option requires ``CONFIG_BOOT_CONFIG_EMBED=y``, a non-empty
> > +``CONFIG_BOOT_CONFIG_EMBED_FILE``, and an architecture that selects
> > +``CONFIG_ARCH_SUPPORTS_CMDLINE_FROM_BOOTCONFIG``. Currently only x86
> > +selects it; on other architectures the embedded bootconfig still works,
> > +but only through the late runtime parser.
> 
> As commented by Sashiko, here we need to mention that this option requires
> CONFIG_CMDLINE to be empty. This means user can NOT set both option
> at once (This also means user doesn't have to worry about configuration
> conflicts.)

Ack! I will update.

--breno

^ permalink raw reply

* Re: [PATCH v3 1/2] dt-bindings: iio: dac: Add AD5529R
From: David Lechner @ 2026-06-24 15:13 UTC (permalink / raw)
  To: Janani Sunil, Jonathan Cameron, Rodrigo Alencar
  Cc: Nuno Sá, Conor Dooley, Janani Sunil, Lars-Peter Clausen,
	Michael Hennerich, Nuno Sá, Andy Shevchenko, Rob Herring,
	Krzysztof Kozlowski, Conor Dooley, Philipp Zabel, Jonathan Corbet,
	Shuah Khan, linux-iio, devicetree, linux-kernel, linux-doc,
	Mark Brown
In-Reply-To: <9abc53d0-432f-48fc-9e21-4d9a3c5e129f@gmail.com>


>>>>>    
>>>>>> But yes, I do feel that the whole feature is for aggregation so seeing
>>>>>> one device with 32 channels is the expectation here? Rather than seeing
>>>>>> two devices with 16 channels.
>>>>> Yes, I think aggregation is the whole point there... so that the IIO driver
>>>>> is multi-device-aware.
>>>> Which makes me feel that different pins per device might be possible
>>>> from an HW point of view but does not make much sense. For example, for
>>>> the buffer example I would expect LDAC to be shared between all the
>>>> devices.
>>> That is why I would still suggest the multi-dac node in the middle...
>>> the parent node can hold shared resources, while the dac children can
>>> have their own, overriding or inheriting stuff.
>>>
>> Before going down that path I'd want confirmation this is something we
>> actually think anyone will build.
>>
>> Jonathan
> 
> To directly answer your question- we currently do not have a platform that supports multi device topology with independent supplies or reset lines.
> Given that, I agree to start with the parallel wiring assumption and defer per chip resource variation under there is a solid use case. I will also drop the "adi,resolution" proposal and proceed with "adi,device-addrs" in the AD5529R binding.
> With all of the above, the proposed binding for the multi-device follow up series would look like:
> 
> 
>     dac@0 {
>         compatible = "adi,ad5529r-16";
>         reg = <0>;
>         adi,device-addrs = <0 1>;
>         reset-gpios = <&gpio0 87 GPIO_ACTIVE_LOW>;
>         vdd-supply  = <&vdd_reg>;
>         hvdd-supply = <&hvdd_reg>;
> 
>         channel@0  { reg = <0>;  adi,output-range-microvolt = <0 5000000>; };
>         channel@16 { reg = <16>; adi,output-range-microvolt = <0 40000000>; };
>     };
> 
> Does this look reasonable to everyone?
> 
> Regards,
> Janani Sunil
> 

LGTM. Seems like the simplest way to handle it and should cover most
use cases.

^ permalink raw reply

* Re: [PATCH v8 04/46] KVM: Decouple kvm_has_arch_private_mem from CONFIG_KVM_VM_MEMORY_ATTRIBUTES
From: Sean Christopherson @ 2026-06-24 15:12 UTC (permalink / raw)
  To: Ackerley Tng
  Cc: Binbin Wu, aik, andrew.jones, brauner, chao.p.peng, david,
	jmattson, jthoughton, michael.roth, oupton, pankaj.gupta, qperret,
	rick.p.edgecombe, rientjes, shivankg, steven.price, tabba, willy,
	wyihan, yan.y.zhao, forkloop, pratyush, suzuki.poulose,
	aneesh.kumar, liam, Paolo Bonzini, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Barry Song, Axel Rasmussen,
	Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt,
	Kiryl Shutsemau, Baoquan He, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <CAEvNRgGF+O7r-YHqcLp-ZgoXTCbqjuUhpOdD5eE5w2wu3YYYpw@mail.gmail.com>

On Tue, Jun 23, 2026, Ackerley Tng wrote:
> Binbin Wu <binbin.wu@linux.intel.com> writes:
> 
> > On 6/19/2026 8:31 AM, Ackerley Tng via B4 Relay wrote:
> >> From: Sean Christopherson <seanjc@google.com>
> >>
> >> When memory attributes become trackable in guest_memfd, the concept of
> >> having private memory is no longer dependent on
> >> CONFIG_KVM_VM_MEMORY_ATTRIBUTES.
> >>
> >> With this, on x86, kvm_arch_has_private_mem() is defined if some CoCo
> >> platform support (or the testing CONFIG_KVM_SW_PROTECTED_VM) is compiled
> >> in.
> >>
> >> Signed-off-by: Sean Christopherson <seanjc@google.com>
> >> Co-developed-by: Ackerley Tng <ackerleytng@google.com>
> >> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> >
> > Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>
> >
> > One nit below.
> >
> >> ---
> >>  arch/x86/include/asm/kvm_host.h | 4 +++-
> >>  include/linux/kvm_host.h        | 2 +-
> >>  2 files changed, 4 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> >> index 8e8eb8a5e8a6b..1bde67cf6eb0e 100644
> >> --- a/arch/x86/include/asm/kvm_host.h
> >> +++ b/arch/x86/include/asm/kvm_host.h
> >> @@ -2394,7 +2394,9 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
> >>  		       int tdp_max_root_level, int tdp_huge_page_level);
> >>
> >>
> >> -#ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES
> >> +#if defined(CONFIG_KVM_SW_PROTECTED_VM) ||	\
> >> +	defined(CONFIG_KVM_INTEL_TDX) ||	\
> >> +	defined(CONFIG_KVM_AMD_SEV)
> >
> > Nit:
> > Vertically align the defined(XXX) statements for better readability?
> >
> 
> Sean had this aligned with spaces, and checkpatch complained about

checkpatch is a tool, it is neither omniscient nor authoritative.  And for things
like this, the *entire* purpose for rules/guildlines like "no tabs after spaces"
is to help ensure the code is easier to read, e.g. doesn't end up with wonky
formatting when viewed in certain editors or whatever.  So, ignore checkpatch if
it complains about formatting that is visually superior to what makes checkpatch
happy.

> having no spaces before tabs, so I switched it to tabs instead since I
> don't think alignment like that is officially documented either way.

This exact case may not be "officially" documented, but the general gist is in
Documentation/process/maintainer-tip.rst:

  When splitting function declarations or function calls, then please align
  the first argument in the second line with the first argument in the first
  line::

And there is lots and lots of prior art on-list (from me and others) that is more
or less as good as official documentation.

> Either way is fine :)

Please restore the alignment.

^ permalink raw reply

* Re: [PATCH v3 1/2] dt-bindings: iio: dac: Add AD5529R
From: Janani Sunil @ 2026-06-24 15:01 UTC (permalink / raw)
  To: Jonathan Cameron, Rodrigo Alencar
  Cc: Nuno Sá, Conor Dooley, Janani Sunil, Lars-Peter Clausen,
	Michael Hennerich, David Lechner, Nuno Sá, Andy Shevchenko,
	Rob Herring, Krzysztof Kozlowski, Conor Dooley, Philipp Zabel,
	Jonathan Corbet, Shuah Khan, linux-iio, devicetree, linux-kernel,
	linux-doc, Mark Brown
In-Reply-To: <20260623155732.318f34f2@jic23-huawei>


On 6/23/26 16:57, Jonathan Cameron wrote:
> On Tue, 23 Jun 2026 09:09:14 +0100
> Rodrigo Alencar <455.rodrigo.alencar@gmail.com> wrote:
>
>> On 22/06/26 13:20, Nuno Sá wrote:
>>> On Mon, Jun 22, 2026 at 12:51:20PM +0100, Rodrigo Alencar wrote:
>>>> On 22/06/26 11:29, Nuno Sá wrote:
>>>>> On Mon, Jun 22, 2026 at 10:24:05AM +0100, Rodrigo Alencar wrote:
>>>>>> On 21/06/26 15:33, Jonathan Cameron wrote:
>>>>>>> On Fri, 19 Jun 2026 16:54:11 +0100
>>>>>>> Nuno Sá <noname.nuno@gmail.com> wrote:
>>>>>>>    
>>>>>>>> On Fri, Jun 19, 2026 at 03:12:07PM +0100, Conor Dooley wrote:
>>>>>>>>> On Fri, Jun 19, 2026 at 02:01:08PM +0100, Nuno Sá wrote:
>>>>>>>>>> On Fri, Jun 19, 2026 at 12:40:54PM +0100, Conor Dooley wrote:
>>>>>>>>>>> On Fri, Jun 19, 2026 at 12:36:55PM +0100, Conor Dooley wrote:
>>>>>>>>>>>> On Fri, Jun 19, 2026 at 12:33:11PM +0200, Janani Sunil wrote:
>>>>>>>>>>>>> On 6/14/26 21:44, Jonathan Cameron wrote:
>>>>>>>>>>>>>> On Tue, 9 Jun 2026 16:47:23 +0200
>>>>>>>>>>>>>> Janani Sunil <jan.sun97@gmail.com> wrote:
>>>>>>>>>>>>>>      
>>>>>>>>>>>>>>> On 5/26/26 15:11, Rodrigo Alencar wrote:
>>>>>>>>>>>>>>>> On 26/05/19 05:42PM, Janani Sunil wrote:
>>>>>>>>>>>>>>>>> Devicetree bindings for AD5529R 16 channel 12/16 bit high voltage,
>>>>>>>>>>>>>>>>> buffered voltage output digital-to-analog converter (DAC) with an
>>>>>>>>>>>>>>>>> integrated precision reference.
>>>>>>>>>>>>>>>> ...
>>>>>>>>>>>>>>>> Probably others may comment on that, but...
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> This parent node may support device addressing for multi-device support through
>>>>>>>>>>>>>>>> those ID pins. I suppose that each device may have its own power supplies or
>>>>>>>>>>>>>>>> other resources like the toggle pins or reset and enable.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> That way I suppose that an example would look like...
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> +patternProperties:
>>>>>>>>>>>>>>>>> +  "^channel@([0-9]|1[0-5])$":
>>>>>>>>>>>>>>>>> +    type: object
>>>>>>>>>>>>>>>>> +    description: Child nodes for individual channel configuration
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> +    properties:
>>>>>>>>>>>>>>>>> +      reg:
>>>>>>>>>>>>>>>>> +        description: Channel number.
>>>>>>>>>>>>>>>>> +        minimum: 0
>>>>>>>>>>>>>>>>> +        maximum: 15
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> +      adi,output-range-microvolt:
>>>>>>>>>>>>>>>>> +        description: |
>>>>>>>>>>>>>>>>> +          Output voltage range for this channel as [min, max] in microvolts.
>>>>>>>>>>>>>>>>> +          If not specified, defaults to 0V to 5V range.
>>>>>>>>>>>>>>>>> +        oneOf:
>>>>>>>>>>>>>>>>> +          - items:
>>>>>>>>>>>>>>>>> +              - const: 0
>>>>>>>>>>>>>>>>> +              - enum: [5000000, 10000000, 20000000, 40000000]
>>>>>>>>>>>>>>>>> +          - items:
>>>>>>>>>>>>>>>>> +              - const: -5000000
>>>>>>>>>>>>>>>>> +              - const: 5000000
>>>>>>>>>>>>>>>>> +          - items:
>>>>>>>>>>>>>>>>> +              - const: -10000000
>>>>>>>>>>>>>>>>> +              - const: 10000000
>>>>>>>>>>>>>>>>> +          - items:
>>>>>>>>>>>>>>>>> +              - const: -15000000
>>>>>>>>>>>>>>>>> +              - const: 15000000
>>>>>>>>>>>>>>>>> +          - items:
>>>>>>>>>>>>>>>>> +              - const: -20000000
>>>>>>>>>>>>>>>>> +              - const: 20000000
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> +    required:
>>>>>>>>>>>>>>>>> +      - reg
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> +    additionalProperties: false
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> +required:
>>>>>>>>>>>>>>>>> +  - compatible
>>>>>>>>>>>>>>>>> +  - reg
>>>>>>>>>>>>>>>>> +  - vdd-supply
>>>>>>>>>>>>>>>>> +  - avdd-supply
>>>>>>>>>>>>>>>>> +  - hvdd-supply
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> +dependencies:
>>>>>>>>>>>>>>>>> +  spi-cpha: [ spi-cpol ]
>>>>>>>>>>>>>>>>> +  spi-cpol: [ spi-cpha ]
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> +allOf:
>>>>>>>>>>>>>>>>> +  - $ref: /schemas/spi/spi-peripheral-props.yaml#
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> +unevaluatedProperties: false
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> +examples:
>>>>>>>>>>>>>>>>> +  - |
>>>>>>>>>>>>>>>>> +    #include <dt-bindings/gpio/gpio.h>
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> +    spi {
>>>>>>>>>>>>>>>>> +        #address-cells = <1>;
>>>>>>>>>>>>>>>>> +        #size-cells = <0>;
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> +        dac@0 {
>>>>>>>>>>>>>>>>> +            compatible = "adi,ad5529r-16";
>>>>>>>>>>>>>>>>> +            reg = <0>;
>>>>>>>>>>>>>>>>> +            spi-max-frequency = <25000000>;
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> +            vdd-supply = <&vdd_regulator>;
>>>>>>>>>>>>>>>>> +            avdd-supply = <&avdd_regulator>;
>>>>>>>>>>>>>>>>> +            hvdd-supply = <&hvdd_regulator>;
>>>>>>>>>>>>>>>>> +            hvss-supply = <&hvss_regulator>;
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> +            reset-gpios = <&gpio0 87 GPIO_ACTIVE_LOW>;
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> +            #address-cells = <1>;
>>>>>>>>>>>>>>>>> +            #size-cells = <0>;
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> +            channel@0 {
>>>>>>>>>>>>>>>>> +                reg = <0>;
>>>>>>>>>>>>>>>>> +                adi,output-range-microvolt = <0 5000000>;
>>>>>>>>>>>>>>>>> +            };
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> +            channel@1 {
>>>>>>>>>>>>>>>>> +                reg = <1>;
>>>>>>>>>>>>>>>>> +                adi,output-range-microvolt = <(-10000000) 10000000>;
>>>>>>>>>>>>>>>>> +            };
>>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>>> +            channel@2 {
>>>>>>>>>>>>>>>>> +                reg = <2>;
>>>>>>>>>>>>>>>>> +                adi,output-range-microvolt = <0 40000000>;
>>>>>>>>>>>>>>>>> +            };
>>>>>>>>>>>>>>>>> +        };
>>>>>>>>>>>>>>>>> +    };
>>>>>>>>>>>>>>>> ...
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> 	spi {
>>>>>>>>>>>>>>>> 		#address-cells = <1>;
>>>>>>>>>>>>>>>> 		#size-cells = <0>;
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> 		multi-dac@0 {
>>>>>>>>>>>>>>>> 			compatible = "adi,ad5529r-16";
>>>>>>>>>>>>>>>> 			reg = <0>;
>>>>>>>>>>>>>>>> 			spi-max-frequency = <25000000>;
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> 			#address-cells = <1>;
>>>>>>>>>>>>>>>> 			#size-cells = <0>;
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> 			dac@0 {
>>>>>>>>>>>>>>>> 				reg = <0>;
>>>>>>>>>>>>>>>> 				vdd-supply = <&vdd_regulator>;
>>>>>>>>>>>>>>>> 				avdd-supply = <&avdd_regulator>;
>>>>>>>>>>>>>>>> 				hvdd-supply = <&hvdd_regulator>;
>>>>>>>>>>>>>>>> 				hvss-supply = <&hvss_regulator>;
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> 				reset-gpios = <&gpio0 87 GPIO_ACTIVE_LOW>;
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> 				#address-cells = <1>;
>>>>>>>>>>>>>>>> 				#size-cells = <0>;
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> 				channel@0 {
>>>>>>>>>>>>>>>> 					reg = <0>;
>>>>>>>>>>>>>>>> 					adi,output-range-microvolt = <0 5000000>;
>>>>>>>>>>>>>>>> 				};
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> 				channel@1 {
>>>>>>>>>>>>>>>> 					reg = <1>;
>>>>>>>>>>>>>>>> 					adi,output-range-microvolt = <(-10000000) 10000000>;
>>>>>>>>>>>>>>>> 				};
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> 				channel@2 {
>>>>>>>>>>>>>>>> 					reg = <2>;
>>>>>>>>>>>>>>>> 					adi,output-range-microvolt = <0 40000000>;
>>>>>>>>>>>>>>>> 				};
>>>>>>>>>>>>>>>> 			}
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> 			dac@1 {
>>>>>>>>>>>>>>>> 				reg = <1>;
>>>>>>>>>>>>>>>> 				vdd-supply = <&vdd_regulator>;
>>>>>>>>>>>>>>>> 				avdd-supply = <&avdd_regulator>;
>>>>>>>>>>>>>>>> 				hvdd-supply = <&hvdd_regulator>;
>>>>>>>>>>>>>>>> 				hvss-supply = <&hvss_regulator>;
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> 				reset-gpios = <&gpio0 88 GPIO_ACTIVE_LOW>;
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> 				#address-cells = <1>;
>>>>>>>>>>>>>>>> 				#size-cells = <0>;
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> 				channel@0 {
>>>>>>>>>>>>>>>> 					reg = <0>;
>>>>>>>>>>>>>>>> 					adi,output-range-microvolt = <0 5000000>;
>>>>>>>>>>>>>>>> 				};
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> 				channel@1 {
>>>>>>>>>>>>>>>> 					reg = <1>;
>>>>>>>>>>>>>>>> 					adi,output-range-microvolt = <(-10000000) 10000000>;
>>>>>>>>>>>>>>>> 				};
>>>>>>>>>>>>>>>> 			}
>>>>>>>>>>>>>>>> 		};
>>>>>>>>>>>>>>>> 	};
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> then you might need something like:
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> 	patternProperties:
>>>>>>>>>>>>>>>> 		"^dac@[0-3]$":
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> and put most of the things under this node pattern.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> So the main driver that you're putting together might need to handle up to four instances.
>>>>>>>>>>>>>>>> Even if your current driver cannot handle this, the dt-bindings might need cover that.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Need to double check if each dac node needs a separate compatible, so you would maybe populate
>>>>>>>>>>>>>>>> a platform data to be shared with the child nodes, which would be a separate driver.
>>>>>>>>>>>>>>>> (not sure if it would make sense to mix and match ad5529r-16 and ad5529r-12).
>>>>>>>>>>>>>>> Hi Rodrigo,
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Thank you for looking at this.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> For now, I would prefer to keep the binding scoped to a single AD5529R device instance. The current
>>>>>>>>>>>>>>> hardware/use case we have only needs one device node and the driver is written around that model as well.
>>>>>>>>>>>>>>> While the device addressing pins could allow multi-device topology, we do not have an actual platform using
>>>>>>>>>>>>>>> that configuration at the moment, so I would prefer not to introduce an extra parent/child binding structure
>>>>>>>>>>>>>>> speculatively without a validating use case.
>>>>>>>>>>>>>> Interesting feature - kind of similar to address control on a typical i2c bus device, or
>>>>>>>>>>>>>> looking at it another way a kind of distributed SPI mux.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Challenge of a binding is we need to anticipate the future.  So I think we do need something
>>>>>>>>>>>>>> like Rodrigo is suggesting even if we only (for now) support a single instance in the driver.
>>>>>>>>>>>>>> That would leave the path open to supporting the addressing at a later date.
>>>>>>>>>>>>>> An alternative might be to look at it like a chained device setup. In those we pretend there
>>>>>>>>>>>>>> is just one device with a lot of channels etc.  The snag is that here things are more loosely
>>>>>>>>>>>>>> coupled whereas for those devices it tends to be you have to read / write the same register
>>>>>>>>>>>>>> in all devices in the chain as one big SPI message.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> +CC Mark Brown as he may know of some precedence for this feature. For his reference..
>>>>>>>>>>>>>> - Each of these device has 2 ID pins.  The SPI transfers have to contain the 2 bit
>>>>>>>>>>>>>> value that matches that or they are ignored.  Thus a single bus + 1 chip select can
>>>>>>>>>>>>>> be used to talk to 4 devices.  Question is what that looks like in device tree + I guess
>>>>>>>>>>>>>> longer term how to support it cleanly in SPI.
>>>>>>>>>>>> I'd swear I have seen this before, from some Microchip devices. Let me
>>>>>>>>>>>> see if I can find what I am thinking of...
>>>>>>>>>>>
>>>>>>>>>>> microchip,mcp3911 and microchip,mcp3564 both seem to do this with
>>>>>>>>>>> slightly different properties.
>>>>>>>>>>>
>>>>>>>>>>>    microchip,device-addr:
>>>>>>>>>>>      description: Device address when multiple MCP3911 chips are present on the same SPI bus.
>>>>>>>>>>>      $ref: /schemas/types.yaml#/definitions/uint32
>>>>>>>>>>>      enum: [0, 1, 2, 3]
>>>>>>>>>>>      default: 0
>>>>>>>>>>>
>>>>>>>>>>> and
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>>    microchip,hw-device-address:
>>>>>>>>>>>      $ref: /schemas/types.yaml#/definitions/uint32
>>>>>>>>>>>      minimum: 0
>>>>>>>>>>>      maximum: 3
>>>>>>>>>>>      description:
>>>>>>>>>>>        The address is set on a per-device basis by fuses in the factory,
>>>>>>>>>>>        configured on request. If not requested, the fuses are set for 0x1.
>>>>>>>>>>>        The device address is part of the device markings to avoid
>>>>>>>>>>>        potential confusion. This address is coded on two bits, so four possible
>>>>>>>>>>>        addresses are available when multiple devices are present on the same
>>>>>>>>>>>        SPI bus with only one Chip Select line for all devices.
>>>>>>>>>>>        Each device communication starts by a CS falling edge, followed by the
>>>>>>>>>>>        clocking of the device address (BITS[7:6] - top two bits of COMMAND BYTE
>>>>>>>>>>>        which is first one on the wire).
>>>>>>>>>>>
>>>>>>>>>>> This sounds exactly like the sort of feature that you're dealing with
>>>>>>>>>>> here?
>>>>>>>>>>>      
>>>>>>>>>> The core idea yes but for this chip, things are a bit more annoying (but
>>>>>>>>>> Janani can correct me if I'm wrong). Here, each device can, in theory,
>>>>>>>>>> have it's own supplies, pins and at the very least, channels with maybe
>>>>>>>>>> different scales. That is why Janani is proposing dac nodes. Given I
>>>>>>>>>> honestly don't like much of that "adi,ad5529r-bus" compatible I wondered
>>>>>>>>>> about solving this at the spi level.
>>>>>>>>>>
>>>>>>>>>> Ah and to make it more annoying, we can also mix 12 and 16 bits variants
>>>>>>>>>> together in the same bus.
>>>>>>>>> I'm definitely missing something, because that property for the
>>>>>>>>> microchip devices is not impacted what else is on the bus. AFAICT, you
>>>>>>>>> could have an mcp3911 and an mcp3564 on the same bus even though both
>>>>>>>>> are completely different devices with different drivers. They have
>>>>>>>>> individual device nodes and their own supplies etc etc. These aren't
>>>>>>>>> per-channel properties on an adc or dac, they're per child device on a
>>>>>>>>> spi bus.
>>>>>>>> Maybe I'm the one missing something :). IIRC, spi would not allow two
>>>>>>>> devices on the same CS right? Because for this chip we would need
>>>>>>>> something like:
>>>>>>>>
>>>>>>>> spi {
>>>>>>>> 	dac@0 {
>>>>>>>> 		reg = <0>;
>>>>>>>> 		adi,pin-id = <0>;
>>>>>>>> 	};
>>>>>>>>
>>>>>>>> 	dac@1 {
>>>>>>>> 		reg = <0>; // which seems already problematic?
>>>>>>>> 		adi,pin-id <1>;
>>>>>>>> 	};
>>>>>>>>
>>>>>>>> 	...
>>>>>>>>
>>>>>>>> 	//up to 4
>>>>>>>> };
>>>>>>> Yeah. It's not clear to me how that works for the microchip devices
>>>>>>> (I suspect it doesn't!)
>>>>>>>
>>>>>>> Just thinking as I type, but could we do something a bit nasty with
>>>>>>> a gpio mux that doesn't actually switch but represents the GPIO being
>>>>>>> shared?  Given this is all tied to the spi bus that should all happen
>>>>>>> under serializing locks.
>>>>>>>
>>>>>>> Agreed though that this would be nicer as an SPI thing that let
>>>>>>> us specify that a single CS is share by multiple devices and their
>>>>>>> is some other signal acting to select which one we are talking to.
>>>>>>>    
>>>>>> If the device-addressing on the same chip-select is to be handled
>>>>>> by the spi framework, wouldn't we lose device-specific features?
>>>>>>
>>>>>> I understand that this multi-device feature is there mostly to extend the
>>>>>> channel count from 16 to 32, 48 or 64. I suppose the command:
>>>>>>
>>>>>> 	"MULTI DEVICE SW LDAC MODE"
>>>>>>
>>>>>> exists so that software can update channel values accross multiple devices.
>>>>> Right! You do have a point! I agree the main driver for a feature like
>>>>> this is likely to extend the channel count and effectively "aggregate"
>>>>> devices.
>>>>>
>>>>> But I would say that even with the spi solution the MULTI DEVICE stuff
>>>>> should be doable (as we still need a sort of adi,pin-id property).
>>>> I don't think we can have something like an IIO buffer shared by multiple
>>>> devices. Synchronizing separate devices would be doable with proper hardware
>>>> support for this (probably involving an FGPA).
>>> True!
>>>    
>>>>     
>>>>> But yes, I do feel that the whole feature is for aggregation so seeing
>>>>> one device with 32 channels is the expectation here? Rather than seeing
>>>>> two devices with 16 channels.
>>>> Yes, I think aggregation is the whole point there... so that the IIO driver
>>>> is multi-device-aware.
>>> Which makes me feel that different pins per device might be possible
>>> from an HW point of view but does not make much sense. For example, for
>>> the buffer example I would expect LDAC to be shared between all the
>>> devices.
>> That is why I would still suggest the multi-dac node in the middle...
>> the parent node can hold shared resources, while the dac children can
>> have their own, overriding or inheriting stuff.
>>
> Before going down that path I'd want confirmation this is something we
> actually think anyone will build.
>
> Jonathan

To directly answer your question- we currently do not have a platform that supports multi device topology with independent supplies or reset lines.
Given that, I agree to start with the parallel wiring assumption and defer per chip resource variation under there is a solid use case. I will also drop the "adi,resolution" proposal and proceed with "adi,device-addrs" in the AD5529R binding.
With all of the above, the proposed binding for the multi-device follow up series would look like:


     dac@0 {
         compatible = "adi,ad5529r-16";
         reg = <0>;
         adi,device-addrs = <0 1>;
         reset-gpios = <&gpio0 87 GPIO_ACTIVE_LOW>;
         vdd-supply  = <&vdd_reg>;
         hvdd-supply = <&hvdd_reg>;

         channel@0  { reg = <0>;  adi,output-range-microvolt = <0 5000000>; };
         channel@16 { reg = <16>; adi,output-range-microvolt = <0 40000000>; };
     };

Does this look reasonable to everyone?

Regards,
Janani Sunil


^ permalink raw reply

* Re: [PATCH v16 04/14] lib: kstrtox: add initial value to _parse_integer_limit()
From: Jonathan Cameron @ 2026-06-24 14:54 UTC (permalink / raw)
  To: Rodrigo Alencar
  Cc: rodrigo.alencar, linux-kernel, linux-iio, devicetree, linux-doc,
	linux, David Lechner, Andy Shevchenko, Lars-Peter Clausen,
	Michael Hennerich, Rob Herring, Krzysztof Kozlowski, Conor Dooley,
	Jonathan Corbet, Andrew Morton, Petr Mladek, Steven Rostedt,
	Andy Shevchenko, Rasmus Villemoes, Sergey Senozhatsky, Shuah Khan
In-Reply-To: <20260614210044.19dfc8df@jic23-huawei>

On Sun, 14 Jun 2026 21:00:44 +0100
Jonathan Cameron <jic23@kernel.org> wrote:

> On Thu, 4 Jun 2026 11:09:33 +0100
> Rodrigo Alencar <455.rodrigo.alencar@gmail.com> wrote:
> 
> > On 26/06/04 10:58AM, Rodrigo Alencar via B4 Relay wrote:  
> > > From: Rodrigo Alencar <rodrigo.alencar@analog.com>
> > > 
> > > Add init parameter to _parse_integer_limit() that defines an initial
> > > value for the accumulated result when parsing an 64-bit integer. The
> > > new function prototype is adjusted so that the _parse_integer() macros
> > > stay consistent allowing for one more argument, which defaults to 0.    
> > 
> > ...
> >   
> > >  noinline
> > >  unsigned int _parse_integer_limit(const char *s, unsigned int base, unsigned long long *p,
> > > -				  size_t max_chars)
> > > +				  size_t max_chars, unsigned long long init)
> > >  {
> > >  	unsigned long long res;
> > >  	unsigned int rv;
> > >  
> > > -	res = 0;
> > > +	res = init;    
> > 
> > This might generate conflict, as the code around have changed in linux-next.
> > It is an easy fix though.
> >   
> Thanks for the heads up. Hopefully that will all fall out when I rebase testing
> on rc1 once that is out.
I've done a mid merge cycle rebase as the char-misc branches have merged.
So this should be resolve on my testing branch now.

Thanks
Jonathan

> 
> Jonathan
> 
> > >  	rv = 0;
> > >  	while (max_chars--) {
> > >  		unsigned int c = *s;    
> >   
> 
> 


^ permalink raw reply

* [PATCH v8 10/10] tracing/probes: Add a new testcase for BTF typecasts
From: Masami Hiramatsu (Google) @ 2026-06-24 14:43 UTC (permalink / raw)
  To: Steven Rostedt, Mathieu Desnoyers
  Cc: Jonathan Corbet, Shuah Khan, Masami Hiramatsu, linux-kernel,
	linux-trace-kernel, linux-doc, linux-kselftest
In-Reply-To: <178231208703.732967.1160700962651040729.stgit@devnote2>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

With the introduction of container_of-style BTF typecasting and
per-CPU variable access support in trace probes, we need a way to
verify their functionality and prevent regressions.

Add a new ftrace kselftest and update the trace event sample module
to test and validate these features.

Specifically, update the trace-events-sample module to set up a
periodic timer whose callback accesses a per-CPU counter. Introduce
a new sample trace event, foo_timer_fn, to trace this callback
and log the current counter value.

Then, add a new test case, btf_probe_event.tc, which defines a
dynamic probe on the timer callback. The probe uses BTF typecasting
to recover the parent structure from the timer argument and
this_cpu_read() to fetch the per-CPU counter. The test verifies
the integrity of the implementation by ensuring the values
recorded by the dynamic probe match those from the static tracepoint.

Assisted-by: Antigravity:gemini-3.5-flash
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v8:
  - Add more test cases.
 Changes in v6:
  - Update testcase according to changes.
 Changes in v5:
  - Add more syntax test cases.
 Changes in v4:
  - Fix uprobe $current test.
 Changes in v3:
  - Add syntax test case.
  - Update testcase to use this_cpu_read()
 Changes in v2:
  - Use timer_shutdown_sync() instead of timer_delete_sync() for teardown.
---
 samples/trace_events/trace-events-sample.c         |   40 +++++++++++++++-
 samples/trace_events/trace-events-sample.h         |   34 ++++++++++++-
 .../ftrace/test.d/dynevent/btf_probe_event.tc      |   51 ++++++++++++++++++++
 .../test.d/dynevent/eprobes_syntax_errors.tc       |    3 +
 .../ftrace/test.d/dynevent/fprobe_syntax_errors.tc |   12 +++++
 .../ftrace/test.d/kprobe/kprobe_syntax_errors.tc   |   12 +++++
 .../ftrace/test.d/kprobe/uprobe_syntax_errors.tc   |    5 ++
 7 files changed, 152 insertions(+), 5 deletions(-)
 create mode 100644 tools/testing/selftests/ftrace/test.d/dynevent/btf_probe_event.tc

diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c
index 0b7a6efdb247..ca5d98c360cb 100644
--- a/samples/trace_events/trace-events-sample.c
+++ b/samples/trace_events/trace-events-sample.c
@@ -94,6 +94,20 @@ static int simple_thread_fn(void *arg)
 static DEFINE_MUTEX(thread_mutex);
 static int simple_thread_cnt;
 
+static struct foo_timer_data *foo_timer_data;
+
+static void sample_timer_cb(struct timer_list *t)
+{
+	struct foo_timer_data *data = container_of(t, struct foo_timer_data, timer);
+
+	get_cpu();
+	trace_foo_timer_fn(data);
+	(*this_cpu_ptr(data->counter))++;
+	put_cpu();
+
+	mod_timer(t, jiffies + HZ);
+}
+
 int foo_bar_reg(void)
 {
 	mutex_lock(&thread_mutex);
@@ -132,9 +146,27 @@ void foo_bar_unreg(void)
 
 static int __init trace_event_init(void)
 {
+	foo_timer_data = kzalloc_obj(*foo_timer_data, GFP_KERNEL);
+	if (!foo_timer_data)
+		return -ENOMEM;
+
+	foo_timer_data->name = "sample_timer_counter";
+	foo_timer_data->counter = alloc_percpu(int);
+	if (!foo_timer_data->counter) {
+		kfree(foo_timer_data);
+		return -ENOMEM;
+	}
+
+	timer_setup(&foo_timer_data->timer, sample_timer_cb, 0);
+	mod_timer(&foo_timer_data->timer, jiffies + HZ);
+
 	simple_tsk = kthread_run(simple_thread, NULL, "event-sample");
-	if (IS_ERR(simple_tsk))
-		return -1;
+	if (IS_ERR(simple_tsk)) {
+		timer_shutdown_sync(&foo_timer_data->timer);
+		free_percpu(foo_timer_data->counter);
+		kfree(foo_timer_data);
+		return PTR_ERR(simple_tsk);
+	}
 
 	return 0;
 }
@@ -147,6 +179,10 @@ static void __exit trace_event_exit(void)
 		kthread_stop(simple_tsk_fn);
 	simple_tsk_fn = NULL;
 	mutex_unlock(&thread_mutex);
+
+	timer_shutdown_sync(&foo_timer_data->timer);
+	free_percpu(foo_timer_data->counter);
+	kfree(foo_timer_data);
 }
 
 module_init(trace_event_init);
diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h
index 1a05fc153353..816848a456a2 100644
--- a/samples/trace_events/trace-events-sample.h
+++ b/samples/trace_events/trace-events-sample.h
@@ -247,12 +247,14 @@
  */
 
 /*
- * It is OK to have helper functions in the file, but they need to be protected
- * from being defined more than once. Remember, this file gets included more
- * than once.
+ * It is OK to have helper functions and data structures in the file, but they
+ * need to be protected from being defined more than once. Remember, this file
+ * gets included more than once.
  */
 #ifndef __TRACE_EVENT_SAMPLE_HELPER_FUNCTIONS
 #define __TRACE_EVENT_SAMPLE_HELPER_FUNCTIONS
+#include <linux/timer.h>
+
 static inline int __length_of(const int *list)
 {
 	int i;
@@ -270,6 +272,13 @@ enum {
 	TRACE_SAMPLE_BAR = 4,
 	TRACE_SAMPLE_ZOO = 8,
 };
+
+struct foo_timer_data {
+	const char		*name;
+	struct timer_list	timer;
+	int __percpu		*counter;
+};
+
 #endif
 
 /*
@@ -595,6 +604,25 @@ TRACE_EVENT(foo_rel_loc,
 		  __get_rel_bitmask(bitmask),
 		  __get_rel_cpumask(cpumask))
 );
+
+TRACE_EVENT(foo_timer_fn,
+
+	TP_PROTO(struct foo_timer_data *data),
+
+	TP_ARGS(data),
+
+	TP_STRUCT__entry(
+		__string(	name,			data->name	)
+		__field(	int,			count		)
+	),
+
+	TP_fast_assign(
+		__assign_str(name);
+		__entry->count	= *this_cpu_ptr(data->counter);
+	),
+
+	TP_printk("name=%s count=%d", __get_str(name), __entry->count)
+);
 #endif
 
 /***** NOTICE! The #if protection ends here. *****/
diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/btf_probe_event.tc b/tools/testing/selftests/ftrace/test.d/dynevent/btf_probe_event.tc
new file mode 100644
index 000000000000..96791e120b7d
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/dynevent/btf_probe_event.tc
@@ -0,0 +1,51 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: BTF event with typecast and percpu access
+# requires: dynamic_events "this_cpu_read(<fetcharg>)":README "[(structname[,field])]<argname>[->field[->field|.field...]]":README
+
+# Check if the sample module is loaded
+if ! lsmod | grep -q trace_events_sample; then
+  modprobe trace-events-sample || exit_unsupported
+fi
+
+echo 0 > events/enable
+echo > dynamic_events
+
+# The sample_timer_cb(struct timer_list *t) is called.
+# We want to check (STRUCT,FIELD)VAR typecast and this_cpu_read() access.
+# (foo_timer_data,timer)t converts t to struct foo_timer_data * using container_of.
+# data->counter is a per-cpu pointer to int.
+# this_cpu_read(data->counter) should give the value of the counter.
+
+echo 'f:mysample/myevent sample_timer_cb name=(foo_timer_data,timer)t->name:string count=this_cpu_read((foo_timer_data,timer)t->counter)' >> dynamic_events
+
+echo 1 > events/mysample/myevent/enable
+echo 1 > events/sample-trace/foo_timer_fn/enable
+
+sleep 2
+
+echo 0 > events/mysample/myevent/enable
+echo 0 > events/sample-trace/foo_timer_fn/enable
+
+# Compare the values.
+MATCH=0
+while read line; do
+  if echo $line | grep -q "foo_timer_fn:"; then
+    NAME=`echo $line | sed 's/.*name=\([^ ]*\) .*/\1/'`
+    COUNT=`echo $line | sed 's/.*count=\([^ ]*\).*/\1/'`
+    if grep -q "myevent:.*name=\"${NAME}\" count=$COUNT" trace; then
+       MATCH=$((MATCH+1))
+    fi
+  fi
+done < trace
+
+if [ $MATCH -eq 0 ]; then
+  echo "No matching events found"
+  exit_fail
+fi
+
+# Clean up
+echo 0 > events/mysample/myevent/enable
+echo 0 > events/sample-trace/foo_timer_fn/enable
+echo > dynamic_events
+clear_trace
diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc
index 0e65e787e426..ae17eb344bf7 100644
--- a/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc
+++ b/tools/testing/selftests/ftrace/test.d/dynevent/eprobes_syntax_errors.tc
@@ -21,6 +21,9 @@ check_error 'e:foo/^bar.1 syscalls/sys_enter_openat'	# BAD_EVENT_NAME
 
 check_error 'e:foo/bar syscalls/sys_enter_openat arg=^$foo'	# BAD_ATTACH_ARG
 
+check_error 'e:foo/bar syscalls/sys_enter_openat arg=^COMM'	# NO_EVENT_FIELD
+check_error 'e:foo/bar syscalls/sys_enter_openat arg=^current'	# NO_EVENT_FIELD
+
 if grep -q '<attached-group>\.<attached-event>.*\[if <filter>\]' README; then
   check_error 'e:foo/bar syscalls/sys_enter_openat if ^'	# NO_EP_FILTER
 fi
diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_syntax_errors.tc
index fee479295e2f..e9d7e6919c7f 100644
--- a/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_syntax_errors.tc
+++ b/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_syntax_errors.tc
@@ -112,6 +112,18 @@ check_error 'f vfs_read%return $retval->^foo'	# NO_PTR_STRCT
 check_error 'f vfs_read file->^foo'		# NO_BTF_FIELD
 check_error 'f vfs_read file^-.foo'		# BAD_HYPHEN
 check_error 'f vfs_read ^file:string'		# BAD_TYPE4STR
+if grep -qF "[(structname" README ; then
+check_error 'f vfs_read arg1=(task_struct)file^'		# TYPECAST_REQ_FIELD
+check_error 'f vfs_read arg1=(a)((b)((c)(^(d)file->d)->c)->b)->a'	# TOO_MANY_NESTED
+check_error 'f vfs_read arg1=(task_struct,^in_execve)file->comm'	# TYPECAST_NOT_ALIGNED
+check_error 'f vfs_read arg1=(task_struct,^foo_bar)file->pid'	# NO_BTF_FIELD
+check_error 'f vfs_read arg1=(^task_struct1234)file->pid'	# NO_PTR_STRCT
+check_error 'f vfs_read arg1=(task_struct,se^->group_node)file->comm'	# TYPECAST_BAD_ARROW
+check_error 'f vfs_read arg1=(task_struct,^->pid)file->comm'	# NO_BTF_FIELD
+check_error 'f vfs_read arg1=(task_struct,^.pid)file->comm'	# NO_BTF_FIELD
+check_error 'f vfs_read arg1=(task_struct,^.)file->comm'	# NO_BTF_FIELD
+check_error 'f vfs_read arg1=(task_struct)^@symbol+10->comm'	# TYPECAST_SYM_OFFSET
+fi
 fi
 
 else
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc
index 8f1c58f0c239..21ce8414459f 100644
--- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc
@@ -115,6 +115,18 @@ check_error 'p vfs_read+20 ^$arg*'		# NOFENTRY_ARGS
 check_error 'p vfs_read ^hoge'			# NO_BTFARG
 check_error 'p kfree ^$arg10'			# NO_BTFARG (exceed the number of parameters)
 check_error 'r kfree ^$retval'			# NO_RETVAL
+if grep -qF "[(structname" README ; then
+check_error 'p vfs_read arg1=(task_struct)file^'		# TYPECAST_REQ_FIELD
+check_error 'p vfs_read arg1=(a)((b)((c)(^(d)file->d)->c)->b)->a'	# TOO_MANY_NESTED
+check_error 'p vfs_read arg1=(task_struct,^in_execve)file->comm'	# TYPECAST_NOT_ALIGNED
+check_error 'p vfs_read arg1=(task_struct,^foo_bar)file->pid'	# NO_BTF_FIELD
+check_error 'p vfs_read arg1=(^task_struct1234)file->pid'		# NO_PTR_STRCT
+check_error 'p vfs_read arg1=(task_struct,se^->group_node)file->comm'	# TYPECAST_BAD_ARROW
+check_error 'p vfs_read arg1=(task_struct,^->pid)file->comm'	# NO_BTF_FIELD
+check_error 'p vfs_read arg1=(task_struct,^.pid)file->comm'	# NO_BTF_FIELD
+check_error 'p vfs_read arg1=(task_struct,^.)file->comm'	# NO_BTF_FIELD
+check_error 'p vfs_read arg1=(task_struct)^@symbol+10->comm'	# TYPECAST_SYM_OFFSET
+fi
 else
 check_error 'p vfs_read ^$arg*'			# NOSUP_BTFARG
 fi
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/uprobe_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/kprobe/uprobe_syntax_errors.tc
index c817158b99db..e12dc967ec76 100644
--- a/tools/testing/selftests/ftrace/test.d/kprobe/uprobe_syntax_errors.tc
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/uprobe_syntax_errors.tc
@@ -28,4 +28,9 @@ if grep -q ".*symstr.*" README; then
 check_error 'p /bin/sh:10 $stack0:^symstr'	# BAD_TYPE
 fi
 
+# $current is not supported by uprobe
+if grep -q "\$current.*" README; then
+check_error 'p /bin/sh:10 ^$current:u8'	# BAD_VAR
+fi
+
 exit 0


^ permalink raw reply related

* [PATCH v8 09/10] tracing/probes: Add this_cpu_read() and this_cpu_ptr() dereference method to fetcharg
From: Masami Hiramatsu (Google) @ 2026-06-24 14:42 UTC (permalink / raw)
  To: Steven Rostedt, Mathieu Desnoyers
  Cc: Jonathan Corbet, Shuah Khan, Masami Hiramatsu, linux-kernel,
	linux-trace-kernel, linux-doc, linux-kselftest
In-Reply-To: <178231208703.732967.1160700962651040729.stgit@devnote2>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

When tracing the kernel local variables, sometimes we need to get the
CPU local variables. To access it, current simple dereference is not
enough.

Thus, introduce a special this_cpu_read() dereference to access per-cpu
variable for the current CPU (accessing other CPU variable may race with
updates on other CPUs). Also this_cpu_ptr() is for accessing per-cpu
pointer.

Those are working as same as the kernel percpu macro.

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v6:
  - Rebased on dump fetcharg patch.
  - Fix to fetch static percpu variable with @SYM correctly.
 Changes in v5:
  - Simplify this_cpu_read() into +0(this_cpu_ptr()).
 Changes in v3:
  - Remove NULL check for percpu var because it is just an offset, could be 0.
  - Simplify process_fetch_insn_bottom() code.
  - If the last operation is this_cpu_read(), read only memory of the specific
    size (of type).
 Changes in v2:
  - Drop +CPU/+PCPU and introduce this_cpu_read() and this_cpu_ptr().
  - Support these method with BTF typecast.
  - Just check the base address is NOT NULL instead of is_kernel_percpu_address().
---
 Documentation/trace/eprobetrace.rst |    2 
 Documentation/trace/fprobetrace.rst |    2 
 Documentation/trace/kprobetrace.rst |    2 
 kernel/trace/trace.c                |    1 
 kernel/trace/trace_probe.c          |  143 ++++++++++++++++++++++++++---------
 kernel/trace/trace_probe.h          |    3 -
 kernel/trace/trace_probe_tmpl.h     |   22 ++++-
 7 files changed, 130 insertions(+), 45 deletions(-)

diff --git a/Documentation/trace/eprobetrace.rst b/Documentation/trace/eprobetrace.rst
index 680e0af43d5d..279396951b34 100644
--- a/Documentation/trace/eprobetrace.rst
+++ b/Documentation/trace/eprobetrace.rst
@@ -39,6 +39,8 @@ Synopsis of eprobe_events
   @SYM[+|-offs]	: Fetch memory at SYM +|- offs (SYM should be a data symbol)
   $comm		: Fetch current task comm.
   +|-[u]OFFS(FETCHARG) : Fetch memory at FETCHARG +|- OFFS address.(\*3)(\*4)
+  this_cpu_read(FETCHARG) : Read the value of the per-CPU variable FETCHARG on the current CPU.
+  this_cpu_ptr(FETCHARG) : Get the address of the per-CPU variable FETCHARG on the current CPU.
   \IMM		: Store an immediate value to the argument.
   NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
   FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types
diff --git a/Documentation/trace/fprobetrace.rst b/Documentation/trace/fprobetrace.rst
index 3392cab016b3..3439bc9bd351 100644
--- a/Documentation/trace/fprobetrace.rst
+++ b/Documentation/trace/fprobetrace.rst
@@ -52,6 +52,8 @@ Synopsis of fprobe-events
   $comm         : Fetch current task comm.
   $current      : Fetch the address of the current task_struct.
   +|-[u]OFFS(FETCHARG) : Fetch memory at FETCHARG +|- OFFS address.(\*4)(\*5)
+  this_cpu_read(FETCHARG) : Read the value of the per-CPU variable FETCHARG on the current CPU.
+  this_cpu_ptr(FETCHARG) : Get the address of the per-CPU variable FETCHARG on the current CPU.
   \IMM          : Store an immediate value to the argument.
   NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
   FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types
diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst
index 81e4fe38791d..9ae330eb0a52 100644
--- a/Documentation/trace/kprobetrace.rst
+++ b/Documentation/trace/kprobetrace.rst
@@ -55,6 +55,8 @@ Synopsis of kprobe_events
   $comm		: Fetch current task comm.
   $current      : Fetch the address of the current task_struct.
   +|-[u]OFFS(FETCHARG) : Fetch memory at FETCHARG +|- OFFS address.(\*3)(\*4)
+  this_cpu_read(FETCHARG) : Read the value of the per-CPU variable FETCHARG on the current CPU.
+  this_cpu_ptr(FETCHARG) : Get the address of the per-CPU variable FETCHARG on the current CPU.
   \IMM		: Store an immediate value to the argument.
   NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
   FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7a5676524f1a..d4121acc2938 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4332,6 +4332,7 @@ static const char readme_msg[] =
 	"\t           $stack<index>, $stack, $retval, $comm, $current\n"
 #endif
 	"\t           +|-[u]<offset>(<fetcharg>), \\imm-value, \\\"imm-string\"\n"
+	"\t           this_cpu_read(<fetcharg>), this_cpu_ptr(<fetcharg>)\n"
 	"\t     kernel return probes support: $retval, $arg<N>, $comm\n"
 	"\t     type: s8/16/32/64, u8/16/32/64, x8/16/32/64, char, string, symbol,\n"
 	"\t           b<bit-width>@<bit-offset>/<container-size>, ustring,\n"
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index eb58b70ae082..f84a4d7d2e02 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -345,6 +345,100 @@ static int parse_trace_event(char *arg, struct fetch_insn *code,
 	return -EINVAL;
 }
 
+/* this_cpu_* parser */
+#define THIS_CPU_PTR_PREFIX "this_cpu_ptr("
+#define THIS_CPU_READ_PREFIX "this_cpu_read("
+#define THIS_CPU_PTR_LEN (sizeof(THIS_CPU_PTR_PREFIX) - 1)
+#define THIS_CPU_READ_LEN (sizeof(THIS_CPU_READ_PREFIX) - 1)
+
+static int
+parse_probe_arg(char *arg, const struct fetch_type *type,
+		struct fetch_insn **pcode, struct fetch_insn *end,
+		struct traceprobe_parse_context *ctx);
+
+/* handle dereference nested call */
+static inline int handle_dereference(char *arg, struct fetch_insn **pcode,
+	struct fetch_insn *end, struct traceprobe_parse_context *ctx,
+	int deref, long offset)
+{
+	const struct fetch_type *type = find_fetch_type(NULL, ctx->flags);
+	struct fetch_insn *code = *pcode;
+	int cur_offs = ctx->offset;
+	char *tmp;
+	int ret;
+
+	tmp = strrchr(arg, ')');
+	if (!tmp) {
+		trace_probe_log_err(ctx->offset + strlen(arg),
+					DEREF_OPEN_BRACE);
+		return -EINVAL;
+	}
+
+	*tmp = '\0';
+	ret = parse_probe_arg(arg, type, &code, end, ctx);
+	if (ret)
+		return ret;
+	ctx->offset = cur_offs;
+	if (code->op == FETCH_OP_COMM || code->op == FETCH_OP_IMMSTR) {
+		trace_probe_log_err(ctx->offset, COMM_CANT_DEREF);
+		return -EINVAL;
+	}
+
+	/*
+	 * this_cpu_ptr(@SYM) does not use SYM value, but use SYM address.
+	 * So we overwrite the last FETCH_OP_DEREF with FETCH_OP_CPU_PTR.
+	 */
+	if (!(deref == FETCH_OP_CPU_PTR && *arg == '@')) {
+		code++;
+		if (code == end) {
+			trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
+			return -EINVAL;
+		}
+	}
+	*pcode = code;
+
+	code->op = deref;
+	code->offset = offset;
+	/* Reset the last type if used */
+	ctx->last_type = NULL;
+	return 0;
+}
+
+static int parse_this_cpu(char *arg, struct fetch_insn **pcode,
+			  struct fetch_insn *end,
+			  struct traceprobe_parse_context *ctx)
+{
+	struct fetch_insn *code;
+	bool is_ptr = false;
+	int ret;
+
+	if (str_has_prefix(arg, THIS_CPU_PTR_PREFIX)) {
+		arg += THIS_CPU_PTR_LEN;
+		ctx->offset += THIS_CPU_PTR_LEN;
+		is_ptr = true;
+	} else if (str_has_prefix(arg, THIS_CPU_READ_PREFIX)) {
+		arg += THIS_CPU_READ_LEN;
+		ctx->offset += THIS_CPU_READ_LEN;
+	} else
+		return -EINVAL;
+
+	ret = handle_dereference(arg, pcode, end, ctx, FETCH_OP_CPU_PTR, 0);
+	if (ret || is_ptr)
+		return ret;
+
+	/* this_cpu_read(VAR) -> +0(this_cpu_ptr(VAR)) */
+	code = *pcode;
+	code++;
+	if (code == end) {
+		trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
+		return -EINVAL;
+	}
+	code->op = FETCH_OP_DEREF;
+	code->offset = 0;
+	*pcode = code;
+	return 0;
+}
+
 #ifdef CONFIG_PROBE_EVENTS_BTF_ARGS
 
 static u32 btf_type_int(const struct btf_type *t)
@@ -904,11 +998,6 @@ static char *find_matched_close_paren(char *s)
 	return NULL;
 }
 
-static int
-parse_probe_arg(char *arg, const struct fetch_type *type,
-		struct fetch_insn **pcode, struct fetch_insn *end,
-		struct traceprobe_parse_context *ctx);
-
 static int handle_typecast(char *arg, struct fetch_insn **pcode,
 			   struct fetch_insn *end,
 			   struct traceprobe_parse_context *ctx)
@@ -961,7 +1050,9 @@ static int handle_typecast(char *arg, struct fetch_insn **pcode,
 		/* Skip '(' */
 		ctx->offset += 1;
 		tmp++;
-	} else if (*tmp == '+' || *tmp == '-') {
+	} else if (*tmp == '+' || *tmp == '-' ||
+		   str_has_prefix(tmp, THIS_CPU_PTR_PREFIX) ||
+		   str_has_prefix(tmp, THIS_CPU_READ_PREFIX)) {
 		/* Dereference can have another field access inside it. */
 		char *open = strchr(tmp + 1, '(');
 
@@ -1481,36 +1572,9 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
 		}
 		ctx->offset += (tmp + 1 - arg) + (arg[0] != '-' ? 1 : 0);
 		arg = tmp + 1;
-		tmp = strrchr(arg, ')');
-		if (!tmp) {
-			trace_probe_log_err(ctx->offset + strlen(arg),
-					    DEREF_OPEN_BRACE);
-			return -EINVAL;
-		} else {
-			const struct fetch_type *t2 = find_fetch_type(NULL, ctx->flags);
-			int cur_offs = ctx->offset;
-
-			*tmp = '\0';
-			ret = parse_probe_arg(arg, t2, &code, end, ctx);
-			if (ret)
-				break;
-			ctx->offset = cur_offs;
-			if (code->op == FETCH_OP_COMM ||
-			    code->op == FETCH_OP_IMMSTR) {
-				trace_probe_log_err(ctx->offset, COMM_CANT_DEREF);
-				return -EINVAL;
-			}
-			if (++code == end) {
-				trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
-				return -EINVAL;
-			}
-			*pcode = code;
-
-			code->op = deref;
-			code->offset = offset;
-			/* Reset the last type if used */
-			ctx->last_type = NULL;
-		}
+		ret = handle_dereference(arg, pcode, end, ctx, deref, offset);
+		if (ret < 0)
+			return ret;
 		break;
 	case '\\':	/* Immediate value */
 		if (arg[1] == '"') {	/* Immediate string */
@@ -1531,7 +1595,10 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
 		ret = handle_typecast(arg, pcode, end, ctx);
 		break;
 	default:
-		if (isalpha(arg[0]) || arg[0] == '_') {
+		if (str_has_prefix(arg, THIS_CPU_PTR_PREFIX) ||
+		    str_has_prefix(arg, THIS_CPU_READ_PREFIX)) {
+			ret = parse_this_cpu(arg, pcode, end, ctx);
+		} else if (isalpha(arg[0]) || arg[0] == '_') {
 			/* BTF variable or event field*/
 			if (ctx->flags & TPARG_FL_TEVENT) {
 				ret = parse_trace_event(arg, *pcode, ctx);
@@ -1548,8 +1615,8 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
 				return -EINVAL;
 			}
 			ret = parse_btf_arg(arg, pcode, end, ctx);
-			break;
 		}
+		break;
 	}
 	if (!ret && code->op == FETCH_OP_NOP) {
 		/* Parsed, but do not find fetch method */
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 053f72fdaece..9955a36acbb1 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -101,6 +101,7 @@ typedef int (*print_type_func_t)(struct trace_seq *, void *, void *);
 	/* Stage 2 (dereference) ops */					\
 	FETCH_OP(DEREF, offset),	/* Dereference: .offset */	\
 	FETCH_OP(UDEREF, offset),	/* User-space dereference: .offset */\
+	FETCH_OP(CPU_PTR, none),	/* Per-CPU pointer: .offset */	\
 	/* Stage 3 (store) ops */					\
 	FETCH_OP(ST_RAW, store),	/* Raw value: .size */		\
 	FETCH_OP(ST_MEM, store),	/* Memory: .offset, .size */	\
@@ -596,7 +597,7 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
 	C(TYPECAST_NOT_EVENT,	"Typecasts are only for eprobe fields"), \
 	C(TYPECAST_REQ_FIELD,	"Typecast requires a field access"),	\
 	C(TOO_MANY_NESTED,	"Too many nested typecasts/dereferences"), \
-	C(TYPECAST_SYM_OFFSET,	"@SYM+/-OFFSET with typecast needs parentheses") \
+	C(TYPECAST_SYM_OFFSET,	"@SYM+/-OFFSET with typecast needs parentheses"), \
 	C(TYPECAST_NOT_ALIGNED,	"Typecast field option is not byte-aligned"), \
 	C(TYPECAST_BAD_ARROW,	"Typecast field option does not support -> operator"),
 
diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h
index d0e9662cde00..8db12f758fda 100644
--- a/kernel/trace/trace_probe_tmpl.h
+++ b/kernel/trace/trace_probe_tmpl.h
@@ -129,25 +129,35 @@ process_fetch_insn_bottom(struct fetch_insn *code, unsigned long val,
 	struct fetch_insn *s3 = NULL;
 	int total = 0, ret = 0, i = 0;
 	u32 loc = 0;
-	unsigned long lval = val;
+	unsigned long lval, llval = val;
 
 stage2:
 	/* 2nd stage: dereference memory if needed */
 	do {
-		if (code->op == FETCH_OP_DEREF) {
-			lval = val;
+		lval = val;
+		switch (code->op) {
+		case FETCH_OP_DEREF:
 			ret = probe_mem_read(&val, (void *)val + code->offset,
 					     sizeof(val));
-		} else if (code->op == FETCH_OP_UDEREF) {
-			lval = val;
+			break;
+		case FETCH_OP_UDEREF:
 			ret = probe_mem_read_user(&val,
 				 (void *)val + code->offset, sizeof(val));
-		} else
 			break;
+		case FETCH_OP_CPU_PTR:
+			val = (unsigned long)this_cpu_ptr((void __percpu *)val);
+			ret = 0;
+			break;
+		default:
+			lval = llval;
+			goto out;
+		}
 		if (ret)
 			return ret;
+		llval = lval;
 		code++;
 	} while (1);
+out:
 
 	s3 = code;
 stage3:


^ permalink raw reply related

* [PATCH v8 08/10] tracing/probes: Add $current variable support
From: Masami Hiramatsu (Google) @ 2026-06-24 14:42 UTC (permalink / raw)
  To: Steven Rostedt, Mathieu Desnoyers
  Cc: Jonathan Corbet, Shuah Khan, Masami Hiramatsu, linux-kernel,
	linux-trace-kernel, linux-doc, linux-kselftest
In-Reply-To: <178231208703.732967.1160700962651040729.stgit@devnote2>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Since we can use the BTF to cast value to a structure pointer type,
it is useful to introduce "$current" special variable support to
fetcharg.

User can define a fetcharg to access current task_struct properties
using BTF info. e.g.

  $current->cpus_ptr

Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v8:
  - Avoid uninitialized ctx->btf issue on $current without typecast.
 Changes in v7:
  - Fix to use force-typecast for task_struct implicitly.
 Changes in v6:
  - Rebased on dump fetcharg patch.
  - Remove function name/eprobe requirement for $current.
 Changes in v5:
  - Use s32 for bof_find_btf_id().
 Changes in v4:
  - Add $current in README when CONFIG_HAVE_FUNCTION_ARG_ACCESS_API=y case.
  - Fix to prohibit using $current in eprobes and address based kprobes.
 Changes in v3:
  - Remove $current support from eprobes (because eprobes is only for event)
  - Prohibit uprobes to use $current.
 Changes in v2:
   - Support to parse $current in parse_btf_arg().
   - If no typecast on $current, it automatically casted to task_struct.
   - Check error case if $current follows something except for "-".
---
 Documentation/trace/fprobetrace.rst |    1 +
 Documentation/trace/kprobetrace.rst |    1 +
 kernel/trace/trace.c                |    4 ++--
 kernel/trace/trace_probe.c          |   37 ++++++++++++++++++++++++++++++++++-
 kernel/trace/trace_probe.h          |    1 +
 kernel/trace/trace_probe_tmpl.h     |    3 +++
 6 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/Documentation/trace/fprobetrace.rst b/Documentation/trace/fprobetrace.rst
index 290a9e6f7491..3392cab016b3 100644
--- a/Documentation/trace/fprobetrace.rst
+++ b/Documentation/trace/fprobetrace.rst
@@ -50,6 +50,7 @@ Synopsis of fprobe-events
   $argN         : Fetch the Nth function argument. (N >= 1) (\*2)
   $retval       : Fetch return value.(\*3)
   $comm         : Fetch current task comm.
+  $current      : Fetch the address of the current task_struct.
   +|-[u]OFFS(FETCHARG) : Fetch memory at FETCHARG +|- OFFS address.(\*4)(\*5)
   \IMM          : Store an immediate value to the argument.
   NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst
index a62707e6a9f2..81e4fe38791d 100644
--- a/Documentation/trace/kprobetrace.rst
+++ b/Documentation/trace/kprobetrace.rst
@@ -53,6 +53,7 @@ Synopsis of kprobe_events
   $argN		: Fetch the Nth function argument. (N >= 1) (\*1)
   $retval	: Fetch return value.(\*2)
   $comm		: Fetch current task comm.
+  $current      : Fetch the address of the current task_struct.
   +|-[u]OFFS(FETCHARG) : Fetch memory at FETCHARG +|- OFFS address.(\*3)(\*4)
   \IMM		: Store an immediate value to the argument.
   NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0e36af853199..7a5676524f1a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4323,13 +4323,13 @@ static const char readme_msg[] =
 	"\t     args: <name>=fetcharg[:type]\n"
 	"\t fetcharg: (%<register>|$<efield>), @<address>, @<symbol>[+|-<offset>],\n"
 #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
-	"\t           $stack<index>, $stack, $retval, $comm, $arg<N>,\n"
+	"\t           $stack<index>, $stack, $retval, $comm, $arg<N>, $current\n"
 #ifdef CONFIG_PROBE_EVENTS_BTF_ARGS
 	"\t           [(structname[,field])]<argname>[->field[->field|.field...]],\n"
 	"\t           [(structname[,field])](fetcharg)->field[->field|.field...],\n"
 #endif
 #else
-	"\t           $stack<index>, $stack, $retval, $comm,\n"
+	"\t           $stack<index>, $stack, $retval, $comm, $current\n"
 #endif
 	"\t           +|-[u]<offset>(<fetcharg>), \\imm-value, \\\"imm-string\"\n"
 	"\t     kernel return probes support: $retval, $arg<N>, $comm\n"
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 2d5b2686cc15..eb58b70ae082 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -692,7 +692,9 @@ static int parse_btf_arg(char *varname,
 	int i, is_ptr, ret;
 	u32 tid;
 
-	if (!ctx->funcname && !(ctx->flags & TPARG_FL_TEVENT))
+	/* Note: field is not separated at this point, so check prefix. */
+	if (!str_has_prefix(varname, "$current") &&
+	    !ctx->funcname && !(ctx->flags & TPARG_FL_TEVENT))
 		return -EINVAL;
 
 	is_ptr = split_next_field(varname, &field, ctx);
@@ -705,6 +707,20 @@ static int parse_btf_arg(char *varname,
 		return -EOPNOTSUPP;
 	}
 
+	if (!strcmp(varname, "$current")) {
+		code->op = FETCH_OP_CURRENT;
+		/* If no typecast is specified for $current, use task_struct by default */
+		ret = bpf_find_btf_id("task_struct", BTF_KIND_STRUCT, &ctx->struct_btf);
+		if (ret < 0) {
+			trace_probe_log_err(ctx->offset, NO_BTF_ENTRY);
+			return -ENOENT;
+		}
+		tid = (u32)ret;
+		type = ctx->last_struct =
+			btf_type_skip_modifiers(ctx->struct_btf, tid, NULL);
+		goto found_type;
+	}
+
 	if (ctx->flags & TPARG_FL_RETURN && !strcmp(varname, "$retval")) {
 		code->op = FETCH_OP_RETVAL;
 		/* Check whether the function return type is not void, even with typecast. */
@@ -761,6 +777,7 @@ static int parse_btf_arg(char *varname,
 
 found:
 	type = btf_type_skip_modifiers(ctx->btf, tid, NULL);
+found_type:
 	if (!type) {
 		trace_probe_log_err(ctx->offset, BAD_BTF_TID);
 		return -EINVAL;
@@ -1270,6 +1287,24 @@ static int parse_probe_vars(char *orig_arg, const struct fetch_type *t,
 		return 0;
 	}
 
+	/* $current returns the address of the current task_struct. */
+	if (str_has_prefix(arg, "current")) {
+		/* $current is only supported by kernel probe. */
+		if (!(ctx->flags & TPARG_FL_KERNEL)) {
+			err = TP_ERR_BAD_VAR;
+			goto inval;
+		}
+		arg += strlen("current");
+		if (*arg == '-' && IS_ENABLED(CONFIG_PROBE_EVENTS_BTF_ARGS))
+			return parse_btf_arg(orig_arg, pcode, end, ctx);
+
+		if (*arg != '\0')
+			goto inval;
+
+		code->op = FETCH_OP_CURRENT;
+		return 0;
+	}
+
 #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
 	len = str_has_prefix(arg, "arg");
 	if (len) {
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index e7fcc77f51fc..053f72fdaece 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -92,6 +92,7 @@ typedef int (*print_type_func_t)(struct trace_seq *, void *, void *);
 	FETCH_OP(RETVAL, none),		/* Return value */		\
 	FETCH_OP(IMM, imm),		/* Immediate: .immediate */	\
 	FETCH_OP(COMM, none),		/* Current comm */		\
+	FETCH_OP(CURRENT, none),	/* Current task_struct address */\
 	FETCH_OP(ARG, param),		/* Argument: .param = index */	\
 	FETCH_OP(FOFFS, imm),		/* File offset: .immediate */	\
 	FETCH_OP(IMMSTR, string),	/* Allocated string: .data */	\
diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h
index 51436f19083b..d0e9662cde00 100644
--- a/kernel/trace/trace_probe_tmpl.h
+++ b/kernel/trace/trace_probe_tmpl.h
@@ -112,6 +112,9 @@ process_common_fetch_insn(struct fetch_insn *code, unsigned long *val)
 	case FETCH_OP_IMMSTR:
 		*val = (unsigned long)code->data;
 		break;
+	case FETCH_OP_CURRENT:
+		*val = (unsigned long)current;
+		break;
 	default:
 		return -EILSEQ;
 	}


^ permalink raw reply related

* [PATCH v8 07/10] tracing/probes: Support field specifier option for typecast
From: Masami Hiramatsu (Google) @ 2026-06-24 14:42 UTC (permalink / raw)
  To: Steven Rostedt, Mathieu Desnoyers
  Cc: Jonathan Corbet, Shuah Khan, Masami Hiramatsu, linux-kernel,
	linux-trace-kernel, linux-doc, linux-kselftest
In-Reply-To: <178231208703.732967.1160700962651040729.stgit@devnote2>

From: Masami Hiramatsu (Google) <mhiramat@kernel.org>

Add a field specifier option for the typecast. This works like
container_of() macro.

    (STRUCT[,FIELD[.FIELD2...]])VAR

This is equivalent to :

    container_of(VAR, struct STRUCT, FIELD[.FIELD2...])

For example:

 echo "f tick_nohz_handler next_tick=(tick_sched,sched_timer)timer->next_tick" >> dynamic_events

This will trace tick_nohz_handler() with its tick_sched::next_tick which
is converted from @timer by contianer_of(tick, struct tick_sched, sched_timer).
So, if you enabkle both fprobes:tick_nohz_handler__entry and
timer:hrtimer_expire_entry events, we will see something like:


          <idle>-0       [002] d.h1.  3778.087272: hrtimer_expire_entry: hrtimer=00000000d63db328 f
unction=tick_nohz_handler now=3777450051040
          <idle>-0       [002] d.h1.  3778.087281: tick_nohz_handler__entry: (tick_nohz_handler+0x4
/0x140) next_tick=3777450000000


Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 Changes in v6:
  - Update according to the allways nested patch.
 Changes in v3:
  - Fix error caret position.
 Changes in v2:
  - Use byteoffset for typecast field offset instead of bitoffset. This fixes negative modulo calculation.
  - Check whether a field is specified after typecast.
  - Reject if typecast field option  has arrow operator.
---
 Documentation/trace/eprobetrace.rst |    5 +
 Documentation/trace/fprobetrace.rst |    8 +-
 Documentation/trace/kprobetrace.rst |    8 +-
 kernel/trace/trace.c                |    4 -
 kernel/trace/trace_probe.c          |  169 ++++++++++++++++++++++++-----------
 kernel/trace/trace_probe.h          |    5 +
 6 files changed, 135 insertions(+), 64 deletions(-)

diff --git a/Documentation/trace/eprobetrace.rst b/Documentation/trace/eprobetrace.rst
index cd0b4aa7f896..680e0af43d5d 100644
--- a/Documentation/trace/eprobetrace.rst
+++ b/Documentation/trace/eprobetrace.rst
@@ -49,7 +49,10 @@ Synopsis of eprobe_events
   (STRUCT)FIELD->MEMBER[->MEMBER] : If BTF is supported, typecast FIELD to
                   a pointer to STRUCT and then derference the pointer defined by
                   ->MEMBER. Note that when this is used, the FIELD name does not
-                  need to be prefixed with a '$'.
+                  need to be prefixed with a '$'. ASGN can be specified optionally.
+		  If ASGN is specified, FIELD will be cast to the same offset
+		  position as the ASGN member, rather than to the beginning of
+		  the STRUCT.
   (STRUCT)(FETCHARG)->MEMBER[->MEMBER] : typecast can nest, so the above can
 		  also be used with another FETCHARG instead of FIELD.
 
diff --git a/Documentation/trace/fprobetrace.rst b/Documentation/trace/fprobetrace.rst
index 6b8bb27bb62d..290a9e6f7491 100644
--- a/Documentation/trace/fprobetrace.rst
+++ b/Documentation/trace/fprobetrace.rst
@@ -57,10 +57,12 @@ Synopsis of fprobe-events
                   (u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal types
                   (x8/x16/x32/x64), "char", "string", "ustring", "symbol", "symstr"
                   and bitfield are supported.
-  (STRUCT)FIELD->MEMBER[->MEMBER] : If BTF is supported, typecast FIELD to
+  (STRUCT[,ASGN])FIELD->MEMBER[->MEMBER] : If BTF is supported, typecast FIELD to
                   a pointer to STRUCT and then derference the pointer defined by
-                  ->MEMBER.
-  (STRUCT)(FETCHARG)->MEMBER[->MEMBER] : typecast can nest, so the above can
+                  ->MEMBER. ASGN can be specified optionally. If ASGN is specified,
+		  FIELD will be cast to the same offset position as the ASGN member,
+		  rather than to the beginning of the STRUCT.
+  (STRUCT[,ASGN])(FETCHARG)->MEMBER[->MEMBER] : typecast can nest, so the above can
                  also be used with another FETCHARG instead of FIELD.
 
   (\*1) This is available only when BTF is enabled.
diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst
index c4382765d5b2..a62707e6a9f2 100644
--- a/Documentation/trace/kprobetrace.rst
+++ b/Documentation/trace/kprobetrace.rst
@@ -61,11 +61,13 @@ Synopsis of kprobe_events
 		  (x8/x16/x32/x64), VFS layer common type(%pd/%pD), "char",
                   "string", "ustring", "symbol", "symstr" and bitfield are
                   supported.
-  (STRUCT)FIELD->MEMBER[->MEMBER] : If BTF is supported, typecast FIELD to
+  (STRUCT[,ASGN])FIELD->MEMBER[->MEMBER] : If BTF is supported, typecast FIELD to
                   a pointer to STRUCT and then derference the pointer defined by
                   ->MEMBER. Note that this is available only when the probe is
-		   on function entry.
-  (STRUCT)(FETCHARG)->MEMBER[->MEMBER] : typecast can nest, so the above can
+		   on function entry. ASGN can be specified optionally. If ASGN
+		   is specified, FIELD will be cast to the same offset position
+		   as the ASGN member, rather than to the beginning of the STRUCT.
+  (STRUCT[,ASGN])(FETCHARG)->MEMBER[->MEMBER] : typecast can nest, so the above can
                  also be used with another FETCHARG instead of FIELD.
 
   (\*1) only for the probe on function entry (offs == 0). Note, this argument access
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4f70318918c2..0e36af853199 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4325,8 +4325,8 @@ static const char readme_msg[] =
 #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
 	"\t           $stack<index>, $stack, $retval, $comm, $arg<N>,\n"
 #ifdef CONFIG_PROBE_EVENTS_BTF_ARGS
-	"\t           [(structname)]<argname>[->field[->field|.field...]],\n"
-	"\t           [(structname)](fetcharg)->field[->field|.field...],\n"
+	"\t           [(structname[,field])]<argname>[->field[->field|.field...]],\n"
+	"\t           [(structname[,field])](fetcharg)->field[->field|.field...],\n"
 #endif
 #else
 	"\t           $stack<index>, $stack, $retval, $comm,\n"
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 87a2bb1cd950..2d5b2686cc15 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -568,6 +568,64 @@ static int split_next_field(char *varname, char **next_field,
 	return ret;
 }
 
+/* Inner loop for solving dot operator ('.'). Return bit-offset of the given field */
+static int get_bitoffset_of_field(char **pfieldname, const struct btf_type **ptype,
+				  struct traceprobe_parse_context *ctx)
+{
+	const struct btf_type *type = *ptype;
+	const struct btf_member *field;
+	struct btf *btf = ctx_btf(ctx);
+	char *fieldname = *pfieldname;
+	int bitoffs = 0;
+	u32 anon_offs;
+	char *next;
+	int is_ptr;
+
+	do {
+		next = NULL;
+		is_ptr = split_next_field(fieldname, &next, ctx);
+		if (is_ptr < 0)
+			return is_ptr;
+
+		anon_offs = 0;
+		field = btf_find_struct_member(btf, type, fieldname,
+						&anon_offs);
+		if (IS_ERR(field)) {
+			trace_probe_log_err(ctx->offset, BAD_BTF_TID);
+			return PTR_ERR(field);
+		}
+		if (!field) {
+			trace_probe_log_err(ctx->offset, NO_BTF_FIELD);
+			return -ENOENT;
+		}
+		/* Add anonymous structure/union offset */
+		bitoffs += anon_offs;
+
+		/* Accumulate the bit-offsets of the dot-connected fields */
+		if (btf_type_kflag(type)) {
+			bitoffs += BTF_MEMBER_BIT_OFFSET(field->offset);
+			ctx->last_bitsize = BTF_MEMBER_BITFIELD_SIZE(field->offset);
+		} else {
+			bitoffs += field->offset;
+			ctx->last_bitsize = 0;
+		}
+
+			type = btf_type_skip_modifiers(btf, field->type, NULL);
+			if (!type) {
+				trace_probe_log_err(ctx->offset, BAD_BTF_TID);
+				return -EINVAL;
+			}
+
+		if (next)
+			ctx->offset += next - fieldname;
+		fieldname = next;
+	} while (!is_ptr && fieldname);
+
+	*pfieldname = fieldname;
+	*ptype = type;
+
+	return bitoffs;
+}
 /*
  * Parse the field of data structure. The @type must be a pointer type
  * pointing the target data structure type.
@@ -577,15 +635,13 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type,
 			   struct traceprobe_parse_context *ctx)
 {
 	struct fetch_insn *code = *pcode;
-	const struct btf_member *field;
-	u32 bitoffs, anon_offs;
-	bool is_struct = ctx->struct_btf != NULL;
 	struct btf *btf = ctx_btf(ctx);
-	char *next;
-	int is_ptr;
+	bool is_first_field = true;
+	int bitoffs;
 
 	do {
-		if (!is_struct) {
+		/* For the first field of typecast, @type will be the target structure type. */
+		if (!(is_first_field && ctx->struct_btf)) {
 			/* Outer loop for solving arrow operator ('->') */
 			if (BTF_INFO_KIND(type->info) != BTF_KIND_PTR) {
 				trace_probe_log_err(ctx->offset, NO_PTR_STRCT);
@@ -599,60 +655,25 @@ static int parse_btf_field(char *fieldname, const struct btf_type *type,
 				return -EINVAL;
 			}
 		}
-		/* Only the first type can skip being a pointer */
-		is_struct = false;
-
-		bitoffs = 0;
-		do {
-			/* Inner loop for solving dot operator ('.') */
-			next = NULL;
-			is_ptr = split_next_field(fieldname, &next, ctx);
-			if (is_ptr < 0)
-				return is_ptr;
-
-			anon_offs = 0;
-			field = btf_find_struct_member(btf, type, fieldname,
-						       &anon_offs);
-			if (IS_ERR(field)) {
-				trace_probe_log_err(ctx->offset, BAD_BTF_TID);
-				return PTR_ERR(field);
-			}
-			if (!field) {
-				trace_probe_log_err(ctx->offset, NO_BTF_FIELD);
-				return -ENOENT;
-			}
-			/* Add anonymous structure/union offset */
-			bitoffs += anon_offs;
-
-			/* Accumulate the bit-offsets of the dot-connected fields */
-			if (btf_type_kflag(type)) {
-				bitoffs += BTF_MEMBER_BIT_OFFSET(field->offset);
-				ctx->last_bitsize = BTF_MEMBER_BITFIELD_SIZE(field->offset);
-			} else {
-				bitoffs += field->offset;
-				ctx->last_bitsize = 0;
-			}
-
-			type = btf_type_skip_modifiers(btf, field->type, NULL);
-			if (!type) {
-				trace_probe_log_err(ctx->offset, BAD_BTF_TID);
-				return -EINVAL;
-			}
-
-			ctx->offset += next - fieldname;
-			fieldname = next;
-		} while (!is_ptr && fieldname);
 
+		bitoffs = get_bitoffset_of_field(&fieldname, &type, ctx);
+		if (bitoffs < 0)
+			return bitoffs;
 		if (++code == end) {
 			trace_probe_log_err(ctx->offset, TOO_MANY_OPS);
 			return -EINVAL;
 		}
 		code->op = FETCH_OP_DEREF;	/* TODO: user deref support */
 		code->offset = bitoffs / 8;
+		if (is_first_field && ctx->struct_btf) {
+			/* The first field can be typecasted with field option. */
+			code->offset -= ctx->prefix_byteoffs;
+		}
 		*pcode = code;
 
 		ctx->last_bitoffs = bitoffs % 8;
 		ctx->last_type = type;
+		is_first_field = false;
 	} while (fieldname);
 
 	return 0;
@@ -808,6 +829,46 @@ static int query_btf_struct(const char *sname, struct traceprobe_parse_context *
 	return 0;
 }
 
+static int parse_btf_casttype(char *casttype, struct traceprobe_parse_context *ctx)
+{
+	char *field;
+	int ret;
+
+	/* Field option - evaluated later. */
+	field = strchr(casttype, ',');
+	if (field)
+		*field++ = '\0';
+
+	ret = query_btf_struct(casttype, ctx);
+	if (ret < 0) {
+		trace_probe_log_err(ctx->offset, NO_PTR_STRCT);
+		return -EINVAL;
+	}
+
+	if (field) {
+		struct btf_type *type = (struct btf_type *)ctx->last_struct;
+
+		ctx->offset += field - casttype;
+		ret = get_bitoffset_of_field(&field, &ctx->last_struct, ctx);
+		if (ret < 0)
+			return ret;
+		if (ret % 8) {
+			trace_probe_log_err(ctx->offset, TYPECAST_NOT_ALIGNED);
+			return -EINVAL;
+		}
+		if (field != NULL) {
+			/* this means @field skips an arrow operator ("->"). */
+			trace_probe_log_err(ctx->offset - 2, TYPECAST_BAD_ARROW);
+			return -EINVAL;
+		}
+		ctx->prefix_byteoffs = ret / 8;
+		/* Restore the original struct type (overwritten by get_bitoffset_of_field) */
+		ctx->last_struct = type;
+	}
+
+	return ret;
+}
+
 /* Find the matching closing parenthesis for a given opening parenthesis. */
 static char *find_matched_close_paren(char *s)
 {
@@ -940,14 +1001,14 @@ static int handle_typecast(char *arg, struct fetch_insn **pcode,
 		tmp = close + 2; /* Skip ">" after inner variable name */
 
 	/* resolve the typecast struct name */
-	ret = query_btf_struct(arg + 1, ctx);
-	if (ret < 0) {
-		trace_probe_log_err(orig_offset + 1, NO_PTR_STRCT);
-		return -EINVAL;
-	}
+	ctx->offset = orig_offset + 1; /* for the '(' */
+	ret = parse_btf_casttype(arg + 1, ctx);
+	if (ret < 0)
+		return ret;
 
 	ctx->offset = orig_offset + tmp - arg;
 	ret = parse_btf_field(tmp, ctx->last_struct, pcode, end, ctx);
+	ctx->prefix_byteoffs = 0;
 	return ret;
 }
 
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index f4fbe3010978..e7fcc77f51fc 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -451,6 +451,7 @@ struct traceprobe_parse_context {
 	unsigned int flags;
 	int offset;
 	int nested_level;
+	int prefix_byteoffs;	/* The byte offset of the prefix field of typecast */
 };
 
 /* Each typecast consumes nested level. So the max number of typecast is 3. */
@@ -594,7 +595,9 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call,
 	C(TYPECAST_NOT_EVENT,	"Typecasts are only for eprobe fields"), \
 	C(TYPECAST_REQ_FIELD,	"Typecast requires a field access"),	\
 	C(TOO_MANY_NESTED,	"Too many nested typecasts/dereferences"), \
-	C(TYPECAST_SYM_OFFSET,	"@SYM+/-OFFSET with typecast needs parentheses")
+	C(TYPECAST_SYM_OFFSET,	"@SYM+/-OFFSET with typecast needs parentheses") \
+	C(TYPECAST_NOT_ALIGNED,	"Typecast field option is not byte-aligned"), \
+	C(TYPECAST_BAD_ARROW,	"Typecast field option does not support -> operator"),
 
 #undef C
 #define C(a, b)		TP_ERR_##a


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox