From: Alexey Kardashevskiy <aik@amd.com>
To: "Chenyi Qiang" <chenyi.qiang@intel.com>,
	"David Hildenbrand" <david@redhat.com>,
	"Peter Xu" <peterx@redhat.com>,
	"Paolo Bonzini" <pbonzini@redhat.com>,
	"Philippe Mathieu-Daudé" <philmd@linaro.org>,
	"Michael Roth" <michael.roth@amd.com>
Cc: qemu-devel@nongnu.org, kvm@vger.kernel.org,
	Williams Dan J <dan.j.williams@intel.com>,
	Peng Chao P <chao.p.peng@intel.com>,
	Gao Chao <chao.gao@intel.com>, Xu Yilun <yilun.xu@intel.com>,
	Li Xiaoyao <xiaoyao.li@intel.com>
Subject: Re: [PATCH v2 3/6] memory-attribute-manager: Introduce MemoryAttributeManager to manage RAMBLock with guest_memfd
Date: Thu, 20 Feb 2025 14:02:56 +1100
Message-ID: <89e791a5-b71b-4b9d-a8b4-e225bfbd1bc2@amd.com>
In-Reply-To: <d410d033-dd1c-43fe-85df-1bdaecf250fd@intel.com>



On 19/2/25 17:33, Chenyi Qiang wrote:
> 
> 
> On 2/19/2025 11:49 AM, Alexey Kardashevskiy wrote:
>>
>>
>> On 19/2/25 12:20, Chenyi Qiang wrote:
>>>
>>>
>>> On 2/18/2025 5:19 PM, Alexey Kardashevskiy wrote:
>>>>
>>>>
>>>
>>> [..]
>>>
>>>>> diff --git a/include/system/memory-attribute-manager.h b/include/
>>>>> system/memory-attribute-manager.h
>>>>> new file mode 100644
>>>>> index 0000000000..72adc0028e
>>>>> --- /dev/null
>>>>> +++ b/include/system/memory-attribute-manager.h
>>>>> @@ -0,0 +1,42 @@
>>>>> +/*
>>>>> + * QEMU memory attribute manager
>>>>> + *
>>>>> + * Copyright Intel
>>>>> + *
>>>>> + * Author:
>>>>> + *      Chenyi Qiang <chenyi.qiang@intel.com>
>>>>> + *
>>>>> + * This work is licensed under the terms of the GNU GPL, version 2 or
>>>>> later.
>>>>> + * See the COPYING file in the top-level directory
>>>>> + *
>>>>> + */
>>>>> +
>>>>> +#ifndef SYSTEM_MEMORY_ATTRIBUTE_MANAGER_H
>>>>> +#define SYSTEM_MEMORY_ATTRIBUTE_MANAGER_H
>>>>> +
>>>>> +#include "system/hostmem.h"
>>>>> +
>>>>> +#define TYPE_MEMORY_ATTRIBUTE_MANAGER "memory-attribute-manager"
>>>>> +
>>>>> +OBJECT_DECLARE_TYPE(MemoryAttributeManager,
>>>>> MemoryAttributeManagerClass, MEMORY_ATTRIBUTE_MANAGER)
>>>>> +
>>>>> +struct MemoryAttributeManager {
>>>>> +    Object parent;
>>>>> +
>>>>> +    MemoryRegion *mr;
>>>>> +
>>>>> +    /* 1-setting of the bit represents the memory is populated
>>>>> (shared) */
>>>>> +    int32_t bitmap_size;
>>>>
>>>> unsigned.
>>>>
>>>> Also, do either s/bitmap_size/shared_bitmap_size/ or
>>>> s/shared_bitmap/bitmap/
>>>
>>> Will change it. Thanks.
>>>
>>>>
>>>>
>>>>
>>>>> +    unsigned long *shared_bitmap;
>>>>> +
>>>>> +    QLIST_HEAD(, RamDiscardListener) rdl_list;
>>>>> +};
>>>>> +
>>>>> +struct MemoryAttributeManagerClass {
>>>>> +    ObjectClass parent_class;
>>>>> +};
>>>>> +
>>>>> +int memory_attribute_manager_realize(MemoryAttributeManager *mgr,
>>>>> MemoryRegion *mr);
>>>>> +void memory_attribute_manager_unrealize(MemoryAttributeManager *mgr);
>>>>> +
>>>>> +#endif
>>>>> diff --git a/system/memory-attribute-manager.c b/system/memory-
>>>>> attribute-manager.c
>>>>> new file mode 100644
>>>>> index 0000000000..ed97e43dd0
>>>>> --- /dev/null
>>>>> +++ b/system/memory-attribute-manager.c
>>>>> @@ -0,0 +1,292 @@
>>>>> +/*
>>>>> + * QEMU memory attribute manager
>>>>> + *
>>>>> + * Copyright Intel
>>>>> + *
>>>>> + * Author:
>>>>> + *      Chenyi Qiang <chenyi.qiang@intel.com>
>>>>> + *
>>>>> + * This work is licensed under the terms of the GNU GPL, version 2 or
>>>>> later.
>>>>> + * See the COPYING file in the top-level directory
>>>>> + *
>>>>> + */
>>>>> +
>>>>> +#include "qemu/osdep.h"
>>>>> +#include "qemu/error-report.h"
>>>>> +#include "system/memory-attribute-manager.h"
>>>>> +
>>>>> +OBJECT_DEFINE_TYPE_WITH_INTERFACES(MemoryAttributeManager,
>>>>> +                                   memory_attribute_manager,
>>>>> +                                   MEMORY_ATTRIBUTE_MANAGER,
>>>>> +                                   OBJECT,
>>>>> +                                   { TYPE_RAM_DISCARD_MANAGER },
>>>>> +                                   { })
>>>>> +
>>>>> +static int memory_attribute_manager_get_block_size(const
>>>>> MemoryAttributeManager *mgr)
>>>>> +{
>>>>> +    /*
>>>>> +     * Because page conversion could be manipulated in the size of at
>>>>> least 4K or 4K aligned,
>>>>> +     * Use the host page size as the granularity to track the memory
>>>>> attribute.
>>>>> +     * TODO: if necessary, switch to get the page_size from RAMBlock.
>>>>> +     * i.e. mgr->mr->ram_block->page_size.
>>>>
>>>> I'd assume it is rather necessary already.
>>>
>>> OK, Will return the page_size of RAMBlock directly.
>>>
>>>>
>>>>> +     */
>>>>> +    return qemu_real_host_page_size();
>>>>> +}
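
FWIW, once it returns the RAMBlock page size, this helper could be as simple
as the sketch below (untested; assumes mr->ram_block is always set by the time
the manager is realized - qemu_ram_pagesize() on the block would be the
equivalent accessor):

static int memory_attribute_manager_get_block_size(const MemoryAttributeManager *mgr)
{
    /*
     * guest_memfd page conversions happen at host page granularity at
     * least, so track the attribute at the RAMBlock's page size.
     */
    return mgr->mr->ram_block->page_size;
}
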
>>>>> +
>>>>> +
>>>>> +static bool memory_attribute_rdm_is_populated(const RamDiscardManager
>>>>> *rdm,
>>>>> +                                              const
>>>>> MemoryRegionSection *section)
>>>>> +{
>>>>> +    const MemoryAttributeManager *mgr = MEMORY_ATTRIBUTE_MANAGER(rdm);
>>>>> +    int block_size = memory_attribute_manager_get_block_size(mgr);
>>>>> +    uint64_t first_bit = section->offset_within_region / block_size;
>>>>> +    uint64_t last_bit = first_bit + int128_get64(section->size) /
>>>>> block_size - 1;
>>>>> +    unsigned long first_discard_bit;
>>>>> +
>>>>> +    first_discard_bit = find_next_zero_bit(mgr->shared_bitmap,
>>>>> last_bit + 1, first_bit);
>>>>> +    return first_discard_bit > last_bit;
>>>>> +}
>>>>> +
>>>>> +typedef int (*memory_attribute_section_cb)(MemoryRegionSection *s,
>>>>> void *arg);
>>>>> +
>>>>> +static int memory_attribute_notify_populate_cb(MemoryRegionSection
>>>>> *section, void *arg)
>>>>> +{
>>>>> +    RamDiscardListener *rdl = arg;
>>>>> +
>>>>> +    return rdl->notify_populate(rdl, section);
>>>>> +}
>>>>> +
>>>>> +static int memory_attribute_notify_discard_cb(MemoryRegionSection
>>>>> *section, void *arg)
>>>>> +{
>>>>> +    RamDiscardListener *rdl = arg;
>>>>> +
>>>>> +    rdl->notify_discard(rdl, section);
>>>>> +
>>>>> +    return 0;
>>>>> +}
>>>>> +
>>>>> +static int memory_attribute_for_each_populated_section(const
>>>>> MemoryAttributeManager *mgr,
>>>>> +
>>>>> MemoryRegionSection *section,
>>>>> +                                                       void *arg,
>>>>> +
>>>>> memory_attribute_section_cb cb)
>>>>> +{
>>>>> +    unsigned long first_one_bit, last_one_bit;


Btw, s/first_one_bit/first/ and s/last_one_bit/last/ as it is quite obvious
from the code what these are.


>>>>> +    uint64_t offset, size;
>>>>> +    int block_size = memory_attribute_manager_get_block_size(mgr);
>>>>> +    int ret = 0;
>>>>> +
>>>>> +    first_one_bit = section->offset_within_region / block_size;
>>>>> +    first_one_bit = find_next_bit(mgr->shared_bitmap, mgr->bitmap_size,
>>>>> +                                  first_one_bit);
>>>>> +
>>>>> +    while (first_one_bit < mgr->bitmap_size) {
>>>>> +        MemoryRegionSection tmp = *section;
>>>>> +
>>>>> +        offset = first_one_bit * block_size;
>>>>> +        last_one_bit = find_next_zero_bit(mgr->shared_bitmap, mgr->bitmap_size,
>>>>> +                                          first_one_bit + 1) - 1;
>>>>> +        size = (last_one_bit - first_one_bit + 1) * block_size;
>>>>
>>>>
>>>> What all this math is for if we stuck with VFIO doing 1 page at the
>>>> time? (I think I commented on this)
>>>
>>> Sorry, I missed your previous comment. IMHO, as we track the status in the
>>> bitmap and want to call cb() on the shared part within the
>>> MemoryRegionSection, we do the calculation here to find the expected
>>> sub-range.
>>
>>
>> You find the largest intersection here and call cb() on it, which will call
>> VFIO with 1 page at a time. So you could just call cb() for every page
>> from here, which would make the code simpler.
> 
> I prefer to keep calling cb() on a large intersection. I think in the
> future, after cut_mapping is supported, we won't need to make VFIO call 1
> page at a time. VFIO can operate on the large range directly.

> In addition, calling cb() for every page seems specific to the VFIO usage.
> It is more generic to call on a large intersection. If more RDM listeners
> are added in the future (although VFIO is the only user currently), doing
> the split in the caller is inefficient.

It is a hardly measurable optimization though. Could be a separate 
patch. I do not insist, just do not see the point. If others are fine, I 
am fine too :)
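
Just to illustrate what I meant though, the per-block walk would be roughly
the below (untested sketch, reusing only the helpers already in this patch):

static int memory_attribute_for_each_populated_section(const MemoryAttributeManager *mgr,
                                                       MemoryRegionSection *section,
                                                       void *arg,
                                                       memory_attribute_section_cb cb)
{
    int block_size = memory_attribute_manager_get_block_size(mgr);
    unsigned long first = section->offset_within_region / block_size;
    unsigned long last = first + int128_get64(section->size) / block_size - 1;
    unsigned long bit;
    int ret = 0;

    /* Walk the shared (set) bits one block at a time. */
    for (bit = find_next_bit(mgr->shared_bitmap, last + 1, first);
         bit <= last;
         bit = find_next_bit(mgr->shared_bitmap, last + 1, bit + 1)) {
        MemoryRegionSection tmp = *section;

        if (!memory_region_section_intersect_range(&tmp, bit * block_size,
                                                   block_size)) {
            break;
        }

        ret = cb(&tmp, arg);
        if (ret) {
            error_report("%s: Failed to notify RAM discard listener: %s",
                         __func__, strerror(-ret));
            break;
        }
    }

    return ret;
}
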

> 
>>
>>
>>>>
>>>>> +
>>>>> +        if (!memory_region_section_intersect_range(&tmp, offset,
>>>>> size)) {
>>>>> +            break;
>>>>> +        }
>>>>> +
>>>>> +        ret = cb(&tmp, arg);
>>>>> +        if (ret) {
>>>>> +            error_report("%s: Failed to notify RAM discard listener:
>>>>> %s", __func__,
>>>>> +                         strerror(-ret));
>>>>> +            break;
>>>>> +        }
>>>>> +
>>>>> +        first_one_bit = find_next_bit(mgr->shared_bitmap, mgr->bitmap_size,
>>>>> +                                      last_one_bit + 2);
>>>>> +    }
>>>>> +
>>>>> +    return ret;
>>>>> +}
>>>>> +
>>>
>>> [..]
>>>
>>>>> +
>>>>> +static void
>>>>> memory_attribute_rdm_unregister_listener(RamDiscardManager *rdm,
>>>>> +
>>>>> RamDiscardListener *rdl)
>>>>> +{
>>>>> +    MemoryAttributeManager *mgr = MEMORY_ATTRIBUTE_MANAGER(rdm);
>>>>> +    int ret;
>>>>> +
>>>>> +    g_assert(rdl->section);
>>>>> +    g_assert(rdl->section->mr == mgr->mr);
>>>>> +
>>>>> +    ret = memory_attribute_for_each_populated_section(mgr, rdl->section, rdl,
>>>>> +
>>>>> memory_attribute_notify_discard_cb);
>>>>> +    if (ret) {
>>>>> +        error_report("%s: Failed to unregister RAM discard listener:
>>>>> %s", __func__,
>>>>> +                     strerror(-ret));
>>>>> +    }
>>>>> +
>>>>> +    memory_region_section_free_copy(rdl->section);
>>>>> +    rdl->section = NULL;
>>>>> +    QLIST_REMOVE(rdl, next);
>>>>> +
>>>>> +}
>>>>> +
>>>>> +typedef struct MemoryAttributeReplayData {
>>>>> +    void *fn;
>>>>
>>>> ReplayRamDiscard *fn, not void*.
>>>
>>> We could cast the void *fn either to ReplayRamPopulate or
>>> ReplayRamDiscard (see below).
>>
>>
>> Hard to read, hard to maintain, and they take the same parameters; only the
>> return value is different (int/void). If this is really important, have
>> 2 fn pointers in MemoryAttributeReplayData. It is already hard to follow
>> this train of callbacks.
> 
> Actually, I prefer to unify ReplayRamDiscard and ReplayRamPopulate:
> make ReplayRamDiscard() also return int. Then we only need to
> define one function type, like:
> 
> typedef int (*ReplayMemoryAttributeChange)(MemoryRegionSection *section,
> void *opaque);

This should work.
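
With that change, the replay glue in this file shrinks to something like the
sketch below (the ReplayMemoryAttributeChange typedef is your proposal above,
not an existing type, and it assumes ReplayRamDiscard is switched to return
int in memory.h):

typedef int (*ReplayMemoryAttributeChange)(MemoryRegionSection *section, void *opaque);

typedef struct MemoryAttributeReplayData {
    ReplayMemoryAttributeChange fn;
    void *opaque;
} MemoryAttributeReplayData;

static int memory_attribute_rdm_replay_cb(MemoryRegionSection *section, void *arg)
{
    MemoryAttributeReplayData *data = arg;

    /* One callback serves both the populate and the discard replay paths. */
    return data->fn(section, data->opaque);
}

and both memory_attribute_rdm_replay_populated() and _discarded() pass their
replay_fn straight in, with no casts.
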


> 
> Maybe David can share his opinions.
>>
>>
>>>>> +    void *opaque;
>>>>> +} MemoryAttributeReplayData;
>>>>> +
>>>>> +static int
>>>>> memory_attribute_rdm_replay_populated_cb(MemoryRegionSection *section,
>>>>> void *arg)
>>>>> +{
>>>>> +    MemoryAttributeReplayData *data = arg;
>>>>> +
>>>>> +    return ((ReplayRamPopulate)data->fn)(section, data->opaque);
>>>>> +}
>>>>> +
>>>>> +static int memory_attribute_rdm_replay_populated(const
>>>>> RamDiscardManager *rdm,
>>>>> +                                                 MemoryRegionSection
>>>>> *section,
>>>>> +                                                 ReplayRamPopulate
>>>>> replay_fn,
>>>>> +                                                 void *opaque)
>>>>> +{
>>>>> +    MemoryAttributeManager *mgr = MEMORY_ATTRIBUTE_MANAGER(rdm);
>>>>> +    MemoryAttributeReplayData data = { .fn = replay_fn, .opaque =
>>>>> opaque };
>>>>> +
>>>>> +    g_assert(section->mr == mgr->mr);
>>>>> +    return memory_attribute_for_each_populated_section(mgr, section,
>>>>> &data,
>>>>> +
>>>>> memory_attribute_rdm_replay_populated_cb);
>>>>> +}
>>>>> +
>>>>> +static int
>>>>> memory_attribute_rdm_replay_discarded_cb(MemoryRegionSection *section,
>>>>> void *arg)
>>>>> +{
>>>>> +    MemoryAttributeReplayData *data = arg;
>>>>> +
>>>>> +    ((ReplayRamDiscard)data->fn)(section, data->opaque);
>>>>> +    return 0;
>>>>> +}
>>>>> +
>>>>> +static void memory_attribute_rdm_replay_discarded(const
>>>>> RamDiscardManager *rdm,
>>>>> +                                                  MemoryRegionSection
>>>>> *section,
>>>>> +                                                  ReplayRamDiscard
>>>>> replay_fn,
>>>>> +                                                  void *opaque)
>>>>> +{
>>>>> +    MemoryAttributeManager *mgr = MEMORY_ATTRIBUTE_MANAGER(rdm);
>>>>> +    MemoryAttributeReplayData data = { .fn = replay_fn, .opaque =
>>>>> opaque };
>>>>> +
>>>>> +    g_assert(section->mr == mgr->mr);
>>>>> +    memory_attribute_for_each_discarded_section(mgr, section, &data,
>>>>> +
>>>>> memory_attribute_rdm_replay_discarded_cb);
>>>>> +}
>>>>> +
>>>>> +int memory_attribute_manager_realize(MemoryAttributeManager *mgr,
>>>>> MemoryRegion *mr)
>>>>> +{
>>>>> +    uint64_t bitmap_size;
>>>>> +    int block_size = memory_attribute_manager_get_block_size(mgr);
>>>>> +    int ret;
>>>>> +
>>>>> +    bitmap_size = ROUND_UP(mr->size, block_size) / block_size;
>>>>> +
>>>>> +    mgr->mr = mr;
>>>>> +    mgr->bitmap_size = bitmap_size;
>>>>> +    mgr->shared_bitmap = bitmap_new(bitmap_size);
>>>>> +
>>>>> +    ret = memory_region_set_ram_discard_manager(mgr->mr,
>>>>> RAM_DISCARD_MANAGER(mgr));
>>>>
>>>> Move it 3 lines up and avoid stale data in mgr->mr/bitmap_size/
>>>> shared_bitmap and avoid g_free below?
>>>
>>> Make sense. I will move it up the same as patch 02 before bitmap_new().
>>>
>>>>
>>>>> +    if (ret) {
>>>>> +        g_free(mgr->shared_bitmap);
>>>>> +    }
>>>>> +
>>>>> +    return ret;
>>>>> +}
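
i.e. the reordered realize would look roughly like this (sketch only, built on
this patch's code; the early return replaces the g_free() error path):

int memory_attribute_manager_realize(MemoryAttributeManager *mgr, MemoryRegion *mr)
{
    int block_size;
    int ret;

    ret = memory_region_set_ram_discard_manager(mr, RAM_DISCARD_MANAGER(mgr));
    if (ret) {
        return ret;
    }

    mgr->mr = mr;
    /* Query the block size after mgr->mr is set, in case it comes from the RAMBlock. */
    block_size = memory_attribute_manager_get_block_size(mgr);
    mgr->bitmap_size = ROUND_UP(mr->size, block_size) / block_size;
    mgr->shared_bitmap = bitmap_new(mgr->bitmap_size);

    return 0;
}
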
>>>>> +
>>>>> +void memory_attribute_manager_unrealize(MemoryAttributeManager *mgr)
>>>>> +{
>>>>> +    memory_region_set_ram_discard_manager(mgr->mr, NULL);
>>>>> +
>>>>> +    g_free(mgr->shared_bitmap);
>>>>> +}
>>>>> +
>>>>> +static void memory_attribute_manager_init(Object *obj)
>>>>
>>>> Not used.
>>>>
>>>>> +{
>>>>> +    MemoryAttributeManager *mgr = MEMORY_ATTRIBUTE_MANAGER(obj);
>>>>> +
>>>>> +    QLIST_INIT(&mgr->rdl_list);
>>>>> +}
>>>>> +
>>>>> +static void memory_attribute_manager_finalize(Object *obj)
>>>>
>>>> Not used either. Thanks,
>>>
>>> I think it is OK to define them as placeholders? Just some preference.
>>
>> At the very least gcc should warn on these (I am surprised it did not) and
>> nobody likes this. Thanks,
> 
> I tried a little: they must be defined. The init() and finalize()
> functions are referenced by the OBJECT_DEFINE_TYPE_WITH_INTERFACES() macro.
> I think it is a common template to define them this way.

ah, I missed that. OBJECT_DEFINE_TYPE_WITH_INTERFACES means they have to 
be defined, never mind that. Thanks,


>>
>>
>>>>
>>>>> +{
>>>>> +}
>>>>> +
>>>>> +static void memory_attribute_manager_class_init(ObjectClass *oc, void
>>>>> *data)
>>>>> +{
>>>>> +    RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_CLASS(oc);
>>>>> +
>>>>> +    rdmc->get_min_granularity =
>>>>> memory_attribute_rdm_get_min_granularity;
>>>>> +    rdmc->register_listener = memory_attribute_rdm_register_listener;
>>>>> +    rdmc->unregister_listener =
>>>>> memory_attribute_rdm_unregister_listener;
>>>>> +    rdmc->is_populated = memory_attribute_rdm_is_populated;
>>>>> +    rdmc->replay_populated = memory_attribute_rdm_replay_populated;
>>>>> +    rdmc->replay_discarded = memory_attribute_rdm_replay_discarded;
>>>>> +}
>>>>> diff --git a/system/meson.build b/system/meson.build
>>>>> index 4952f4b2c7..ab07ff1442 100644
>>>>> --- a/system/meson.build
>>>>> +++ b/system/meson.build
>>>>> @@ -15,6 +15,7 @@ system_ss.add(files(
>>>>>       'dirtylimit.c',
>>>>>       'dma-helpers.c',
>>>>>       'globals.c',
>>>>> +  'memory-attribute-manager.c',
>>>>>       'memory_mapping.c',
>>>>>       'qdev-monitor.c',
>>>>>       'qtest.c',
>>>>
>>>
>>
> 

-- 
Alexey


