Linux cryptographic layer development
 help / color / mirror / Atom feed
From: "Kalra, Ashish" <ashish.kalra@amd.com>
To: Ackerley Tng <ackerleytng@google.com>,
	tglx@kernel.org, mingo@redhat.com, bp@alien8.de,
	dave.hansen@linux.intel.com, x86@kernel.org, hpa@zytor.com,
	seanjc@google.com, peterz@infradead.org, thomas.lendacky@amd.com,
	herbert@gondor.apana.org.au, davem@davemloft.net,
	ardb@kernel.org
Cc: pbonzini@redhat.com, aik@amd.com, Michael.Roth@amd.com,
	KPrateek.Nayak@amd.com, Tycho.Andersen@amd.com,
	Nathan.Fontenot@amd.com, jackyli@google.com, pgonda@google.com,
	rientjes@google.com, jacobhxu@google.com, xin@zytor.com,
	pawan.kumar.gupta@linux.intel.com, babu.moger@amd.com,
	dyoung@redhat.com, nikunj@amd.com, john.allen@amd.com,
	darwi@linutronix.de, linux-kernel@vger.kernel.org,
	linux-crypto@vger.kernel.org, kvm@vger.kernel.org,
	linux-coco@lists.linux.dev
Subject: Re: [PATCH v5 4/7] x86/sev: Add support to perform RMP optimizations asynchronously
Date: Thu, 28 May 2026 18:52:27 -0500	[thread overview]
Message-ID: <8b7f6c93-ad5a-45e1-aa70-945518d29ddc@amd.com> (raw)
In-Reply-To: <CAEvNRgGfyb7zvZ1u1j7YLomD+JdAxnVW36gtvNG9gxgZ80vMyQ@mail.gmail.com>

Hello Ackerley,

On 5/28/2026 9:45 AM, Ackerley Tng wrote:
> Ashish Kalra <Ashish.Kalra@amd.com> writes:
> 
> Thank you Ashish!
> 
>> From: Ashish Kalra <ashish.kalra@amd.com>
>>
>> When SEV-SNP is enabled, all writes to memory are checked to ensure
>> integrity of SNP guest memory. This imposes performance overhead on the
>> whole system.
>>
>> RMPOPT is a new instruction that minimizes the performance overhead of
>> RMP checks on the hypervisor and on non-SNP guests by allowing RMP
>> checks to be skipped for 1GB regions of memory that are known not to
>> contain any SEV-SNP guest memory.
>>
>> Add support for performing RMP optimizations asynchronously using a
>> dedicated workqueue.
>>
>> Enable RMPOPT optimizations globally for all system RAM up to 2TB at
> 
> This should also be updated to say "Enable RMPOPT optimizations for up
> to 2TB worth of system RAM at..."
> 
> The current phrasing sounds like only addresses [0, 2TB) are allowed to
> be optimized, but actually any address [start, start + 2TB) can be
> optimized?

Yes, i will update it.

> 
>> RMP initialization time. RMP checks can initially be skipped for 1GB
>> memory ranges that do not contain SEV-SNP guest memory (excluding
>> preassigned pages such as the RMP table and firmware pages). As SNP
>> guests are launched, RMPUPDATE will disable the corresponding RMPOPT
>> optimizations.
>>
>> Suggested-by: Thomas Lendacky <thomas.lendacky@amd.com>
>> Suggested-by: Dave Hansen <dave.hansen@linux.intel.com>
>> Reviewed-by: Ackerley Tng <ackerleytng@google.com>
>> Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
>> ---
>>  arch/x86/virt/svm/sev.c | 167 +++++++++++++++++++++++++++++++++++++++-
>>  1 file changed, 164 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c
>> index 82f9dc7a57c3..8876cac052d5 100644
>> --- a/arch/x86/virt/svm/sev.c
>> +++ b/arch/x86/virt/svm/sev.c
>> @@ -19,6 +19,7 @@
>>  #include <linux/iommu.h>
>>  #include <linux/amd-iommu.h>
>>  #include <linux/nospec.h>
>> +#include <linux/workqueue.h>
>>
>>  #include <asm/sev.h>
>>  #include <asm/processor.h>
>> @@ -125,7 +126,18 @@ static void *rmp_bookkeeping __ro_after_init;
>>  static u64 probed_rmp_base, probed_rmp_size;
>>
>>  static cpumask_t rmpopt_cpumask;
>> -static phys_addr_t rmpopt_pa_start;
>> +static phys_addr_t rmpopt_pa_start, rmpopt_pa_end;
>> +
>> +enum rmpopt_function {
>> +	RMPOPT_FUNC_VERIFY_AND_REPORT_STATUS,
>> +	RMPOPT_FUNC_REPORT_STATUS
>> +};
>> +
>> +#define RMPOPT_WORK_TIMEOUT	10000
>> +
>> +static struct workqueue_struct *rmpopt_wq;
>> +static struct delayed_work rmpopt_delayed_work;
>> +static DEFINE_MUTEX(rmpopt_wq_mutex);
>>
>>  static LIST_HEAD(snp_leaked_pages_list);
>>  static DEFINE_SPINLOCK(snp_leaked_pages_list_lock);
>> @@ -564,12 +576,21 @@ EXPORT_SYMBOL_FOR_MODULES(snp_prepare, "ccp");
>>
>>  static void rmpopt_cleanup(void)
>>  {
>> +	guard(mutex)(&rmpopt_wq_mutex);
>> +
>> +	if (!rmpopt_wq)
>> +		return;
>> +
>> +	cancel_delayed_work_sync(&rmpopt_delayed_work);
>> +	destroy_workqueue(rmpopt_wq);
>> +
>>  	cpus_read_lock();
>>  	wrmsrq_on_cpus(&rmpopt_cpumask, MSR_AMD64_RMPOPT_BASE, 0);
>>  	cpus_read_unlock();
>>
>>  	cpumask_clear(&rmpopt_cpumask);
>> -	rmpopt_pa_start = 0;
>> +	rmpopt_pa_start = rmpopt_pa_end = 0;
>> +	rmpopt_wq = NULL;
>>  }
>>
>>  void snp_shutdown(void)
>> @@ -587,6 +608,105 @@ void snp_shutdown(void)
>>  }
>>  EXPORT_SYMBOL_FOR_MODULES(snp_shutdown, "ccp");
>>
>> +static inline bool __rmpopt(u64 rax, u64 rcx)
> 
> Perhaps use pa_start instead of rax and op_type for rcx?
> 

I used these parameters to align with the RMPOPT specifications (rax and rcx)
which i think makes more sense.

>> +{
>> +	bool optimized;
>> +
>> +	asm volatile(".byte 0xf2, 0x0f, 0x01, 0xfc"
>> +		     : "=@ccc" (optimized)
>> +		     : "a" (rax), "c" (rcx)
>> +		     : "memory", "cc");
>> +
>> +	return optimized;
>> +}
>> +
>> +static void rmpopt(u64 pa)
>> +{
>> +	u64 rax = ALIGN_DOWN(pa, SZ_1G);
>> +	u64 rcx = RMPOPT_FUNC_VERIFY_AND_REPORT_STATUS;
>> +
> 
> And pa_start and op_type here too.
> 
>> +	__rmpopt(rax, rcx);
>> +}
>> +
>> +/*
>> + * 'val' is a system physical address.
>> + */
>> +static void rmpopt_smp(void *val)
>> +{
>> +	rmpopt((u64)val);
>> +}
>> +
>> +/*
>> + * RMPOPT optimizations skip RMP checks at 1GB granularity if this
>> + * range of memory does not contain any SNP guest memory.
>> + */
>> +static void rmpopt_work_handler(struct work_struct *work)
>> +{
>> +	bool current_cpu_cleared = false;
>> +	phys_addr_t pa;
>> +	int this_cpu;
>> +
>> +	pr_info("Attempt RMP optimizations on physical address range @1GB alignment [0x%016llx - 0x%016llx]\n",
>> +		rmpopt_pa_start, rmpopt_pa_end);
>> +
>> +	/*
>> +	 * RMPOPT scans the RMP table, stores the result of the scan in the
>> +	 * reserved processor memory. The RMP scan is the most expensive
>> +	 * part. If a second RMPOPT occurs, it can skip the expensive scan
>> +	 * if they can see a cached result in the reserved processor memory.
>> +	 *
>> +	 * Do RMPOPT on one CPU alone. Then, follow that up with RMPOPT
>> +	 * on every other primary thread. This potentially allows the
> 
> I like the leader and follower comments below, thanks! With this
> leader/follower setup, will the followers definitely see the cached scan
> results, or might the followers still potentially not benefit from the
> caching? If it's still only "potentially", why?

I am verifying with the H/W architects if this is always going to be true or not,
will the followers always benefit from the scan results cached by the leader (first CPU)
or there is a possibility that the followers cannot see/access/get the cached results
and instead do full RMP scanning ?

> 
>> +	 * followers to use the "cached" scan results to avoid repeating
>> +	 * full scans.
>> +	 */
>> +
>> +	/*
>> +	 * Pin the worker to the current CPU for the leader loop so that
>> +	 * this_cpu remains valid and the RMPOPT instruction executes on
>> +	 * the CPU that was cleared from the cpumask.  The workqueue is
>> +	 * WQ_UNBOUND, so without pinning, the scheduler could migrate
>> +	 * the worker between the cpumask manipulation and the leader
>> +	 * loop, causing the leader to run on a different CPU while
>> +	 * this_cpu's core is skipped entirely.
>> +	 *
>> +	 * Use migrate_disable() rather than get_cpu() to prevent
>> +	 * migration while still allowing preemption.
>> +	 *
>> +	 * Note: rmpopt_cpumask is modified here without holding
>> +	 * rmpopt_wq_mutex.  This is safe because the delayed_work
>> +	 * mechanism guarantees single-threaded execution of this
>> +	 * handler, and rmpopt_cleanup() calls cancel_delayed_work_sync()
>> +	 * to ensure handler completion before tearing down the cpumask.
>> +	 */
>> +	migrate_disable();
>> +	this_cpu = smp_processor_id();
>> +	if (cpumask_test_cpu(this_cpu, &rmpopt_cpumask)) {
>> +		cpumask_clear_cpu(this_cpu, &rmpopt_cpumask);
>> +		current_cpu_cleared = true;
>> +	}
>> +
> 
> Instead of reusing the global rmpopt_cpumask, why not make a copy of
> rmpopt_cpumask for this function? Then this function won't have to
> figure out current_cpu_cleared or restore rmpopt_cpumask at the end.
> 
> I'm thinking to also drop the test and clear, this function can just
> always clear, like
> 
>   cpumask_clear_cpu(smp_processor_id(), followers_cpumask);
> 
> and later
> 
>   on_each_cpu_mask(&followers_cpumask, ...);

That's surely a much cleaner approach. Instead of modifying global
rmpopt_cpumask and using a local copy:

  cpumask_var_t follower_mask;
  alloc_cpumask_var(&follower_mask, GFP_KERNEL);
  cpumask_copy(follower_mask, &rmpopt_cpumask);

  migrate_disable();
  this_cpu = smp_processor_id();
  cpumask_clear_cpu(this_cpu, follower_mask);  // modify local only

  // leader loop on this_cpu...
  migrate_enable();

  // follower loop with follower_mask...
  on_each_cpu_mask(follower_mask, rmpopt_smp, ...);

  free_cpumask_var(follower_mask);

This eliminates:
- current_cpu_cleared variable
- The restore at the end
  
Additionally, the global rmpopt_cpumask is never modified, so no concurrency concerns with debugfs or other readers.

> 
> Actually, if for whatever reason cpumask_test_cpu(this_cpu,
> &rmpopt_cpumask) above returns false, would that mean somehow some cpu
> exists that wasn't enabled right when rmpopt was initialized? 

The work handler can always execute on a cpu which is not in the rmpopt_cpumask, so i believe the
cpumask_test_cpu() needs to be there.

The leader loop must only run on a CPU that has RMPOPT_BASE MSR programmed. If the WQ_UNBOUND scheduler puts the
handler on a CPU not in rmpopt_cpumask, that CPU's core never had RMPOPT enabled -> RMPOPT instruction causes #UD.

  So the leader should be conditional:

  cpumask_copy(follower_mask, &rmpopt_cpumask);

  migrate_disable();
  this_cpu = smp_processor_id();

  if (cpumask_test_cpu(this_cpu, follower_mask)) {
      cpumask_clear_cpu(this_cpu, follower_mask);

      /* Leader: prime the RMPOPT cache on this CPU */
      for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G)
          rmpopt(pa);
  }

  migrate_enable();

  /* Followers: run RMPOPT on remaining cores */
  cpus_read_lock();
  for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G) {
      on_each_cpu_mask(follower_mask, rmpopt_smp, (void *)pa, true);
      cond_resched();
  }
  cpus_read_unlock();

If the current CPU isn't in rmpopt_cpumask, the leader is skipped and all cores run as followers — they lose the caching
optimization from a leader priming pass, but correctness is maintained.

Alternatively, i could pick the first CPU from rmpopt_cpumask as the explicit leader instead of relying on whichever CPU the
scheduler chose.

 	cpumask_copy(follower_mask, &rmpopt_cpumask);

        migrate_disable();
        this_cpu = smp_processor_id();

        if (cpumask_test_cpu(this_cpu, follower_mask)) {
                /* Fast path: leader runs locally, no IPIs */
                cpumask_clear_cpu(this_cpu, follower_mask);

                for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G)
                        rmpopt(pa);
        } else {
                /*
                 * Current CPU does not have RMPOPT_BASE MSR programmed.
                 * Pick an explicit leader from the cpumask to avoid #UD.
                 */
                int leader_cpu = cpumask_first(follower_mask);

                cpumask_clear_cpu(leader_cpu, follower_mask);

                for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G)
                        smp_call_function_single(leader_cpu, rmpopt_smp,
                                                 (void *)pa, true);
        }

        migrate_enable();

        /* Followers: run RMPOPT on remaining cores */
        cpus_read_lock();
        for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G) {
                on_each_cpu_mask(follower_mask, rmpopt_smp,
                                 (void *)pa, true);

                /* Give a chance for other threads to run */
                cond_resched();
        }
        cpus_read_unlock();

        free_cpumask_var(follower_mask);
  }

> If yes, what happens if we call rmpopt() on a cpu where it wasn't initialized?

That will cause a #UD exception, if RMPOPT instruction is issued on 
a CPU where RMPOPT is not enabled (RMPOPT_BASE.RMPOPT_EN==0), so
it is essential to issue RMPOPT instruction only on the cpumask (covers both
primary and secondary threads) which was setup initially when rmpopt was
initialized and on which the RMPOPT_BASE MSR was setup and RMPOPT enabled.

I believe, there are actually three cases to be considered here: 

  1. Current CPU is in rmpopt_cpumask -> primary thread, run leader locally, remove from followers
  2. Current CPU's sibling is in rmpopt_cpumask -> sibling thread, RMPOPT_BASE per-core is programmed, run leader locally, 
     remove the sibling's primary from the follower mask
  3. Neither -> new/unknown CPU, RMPOPT_BASE never programmed on this core, fall back to explicit leader via IPI.

So this seems to the *correct* implementation of the RMPOPT loop: 

   	cpumask_copy(follower_mask, &rmpopt_cpumask);

        migrate_disable();
        this_cpu = smp_processor_id();

        if (cpumask_test_cpu(this_cpu, follower_mask)) {
                /*
                 * Current CPU is a primary thread in rmpopt_cpumask.
                 * Run leader locally and remove from follower mask.
                 */
                cpumask_clear_cpu(this_cpu, follower_mask);

                for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G)
                        rmpopt(pa);
        } else if (cpumask_intersects(topology_sibling_cpumask(this_cpu),
                                      follower_mask)) {
                /*
                 * Current CPU is a sibling thread whose primary is in
                 * rmpopt_cpumask.  RMPOPT_BASE MSR is per-core, so it
                 * is safe to run the leader locally.  Remove the sibling's
                 * primary from the follower mask as this core is already
                 * covered by the leader.
                 */
                cpumask_andnot(follower_mask, follower_mask,
                               topology_sibling_cpumask(this_cpu));

                for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G)
                        rmpopt(pa);
        } else {
                /*
                 * Current CPU's core does not have RMPOPT_BASE MSR
                 * programmed.  Pick an explicit leader from the cpumask
                 * to avoid #UD.
                 */
                int leader_cpu = cpumask_first(follower_mask);

                cpumask_clear_cpu(leader_cpu, follower_mask);

                for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G)
                        smp_call_function_single(leader_cpu, rmpopt_smp,
                                                 (void *)pa, true);
        }

        migrate_enable();

        /* Followers: run RMPOPT on remaining cores */
        cpus_read_lock();
        for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G) {
                on_each_cpu_mask(follower_mask, rmpopt_smp,
                                 (void *)pa, true);

                /* Give a chance for other threads to run */
                cond_resched();
        }
        cpus_read_unlock();

        free_cpumask_var(follower_mask);
  }

Thanks,
Ashish

> 
>> +	/* Leader: prime the RMPOPT cache on this CPU */
>> +	for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G)
>> +		rmpopt(pa);
>> +
>> +	migrate_enable();
>> +
>> +	/* Followers: run RMPOPT on all other cores */
>> +	cpus_read_lock();
>> +	for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G) {
>> +		on_each_cpu_mask(&rmpopt_cpumask, rmpopt_smp,
>> +				 (void *)pa, true);
>> +
>> +		 /* Give a chance for other threads to run */
>> +		cond_resched();
>> +	}
>> +	cpus_read_unlock();
>> +
>> +	if (current_cpu_cleared)
>> +		cpumask_set_cpu(this_cpu, &rmpopt_cpumask);
>> +}
>> +
>>
>> [...snip...]
>>

  reply	other threads:[~2026-05-28 23:52 UTC|newest]

Thread overview: 23+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-05-18 21:41 [PATCH v5 0/7] Add RMPOPT support Ashish Kalra
2026-05-18 21:41 ` [PATCH v5 1/7] x86/cpufeatures: Add X86_FEATURE_AMD_RMPOPT feature flag Ashish Kalra
2026-05-27 20:17   ` Borislav Petkov
2026-05-18 21:42 ` [PATCH v5 2/7] x86/msr: add wrmsrq_on_cpus helper Ashish Kalra
2026-05-18 22:04   ` Dave Hansen
2026-05-18 22:09     ` Kalra, Ashish
2026-05-18 22:38       ` Dave Hansen
2026-05-27 21:06   ` Borislav Petkov
2026-05-27 21:38     ` Dave Hansen
2026-05-28  0:43       ` Borislav Petkov
2026-05-28 19:37         ` Kalra, Ashish
2026-05-28 19:50           ` Dave Hansen
2026-05-28 19:55             ` Kalra, Ashish
2026-05-29  0:26               ` Borislav Petkov
2026-05-29  0:29                 ` Kalra, Ashish
2026-05-18 21:42 ` [PATCH v5 3/7] x86/sev: Initialize RMPOPT configuration MSRs Ashish Kalra
2026-05-18 21:42 ` [PATCH v5 4/7] x86/sev: Add support to perform RMP optimizations asynchronously Ashish Kalra
2026-05-28 14:45   ` Ackerley Tng
2026-05-28 23:52     ` Kalra, Ashish [this message]
2026-06-01 18:03       ` Kalra, Ashish
2026-05-18 21:43 ` [PATCH v5 5/7] x86/sev: Add interface to re-enable RMP optimizations Ashish Kalra
2026-05-18 21:43 ` [PATCH v5 6/7] KVM: SEV: Perform RMP optimizations on SNP guest shutdown Ashish Kalra
2026-05-18 21:43 ` [PATCH v5 7/7] x86/sev: Add debugfs support for RMPOPT Ashish Kalra

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=8b7f6c93-ad5a-45e1-aa70-945518d29ddc@amd.com \
    --to=ashish.kalra@amd.com \
    --cc=KPrateek.Nayak@amd.com \
    --cc=Michael.Roth@amd.com \
    --cc=Nathan.Fontenot@amd.com \
    --cc=Tycho.Andersen@amd.com \
    --cc=ackerleytng@google.com \
    --cc=aik@amd.com \
    --cc=ardb@kernel.org \
    --cc=babu.moger@amd.com \
    --cc=bp@alien8.de \
    --cc=darwi@linutronix.de \
    --cc=dave.hansen@linux.intel.com \
    --cc=davem@davemloft.net \
    --cc=dyoung@redhat.com \
    --cc=herbert@gondor.apana.org.au \
    --cc=hpa@zytor.com \
    --cc=jackyli@google.com \
    --cc=jacobhxu@google.com \
    --cc=john.allen@amd.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-coco@lists.linux.dev \
    --cc=linux-crypto@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@redhat.com \
    --cc=nikunj@amd.com \
    --cc=pawan.kumar.gupta@linux.intel.com \
    --cc=pbonzini@redhat.com \
    --cc=peterz@infradead.org \
    --cc=pgonda@google.com \
    --cc=rientjes@google.com \
    --cc=seanjc@google.com \
    --cc=tglx@kernel.org \
    --cc=thomas.lendacky@amd.com \
    --cc=x86@kernel.org \
    --cc=xin@zytor.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox