qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: Alexey Kardashevskiy <aik@amd.com>
To: "Chenyi Qiang" <chenyi.qiang@intel.com>,
	"David Hildenbrand" <david@redhat.com>,
	"Peter Xu" <peterx@redhat.com>,
	"Paolo Bonzini" <pbonzini@redhat.com>,
	"Philippe Mathieu-Daudé" <philmd@linaro.org>,
	"Michael Roth" <michael.roth@amd.com>
Cc: qemu-devel@nongnu.org, kvm@vger.kernel.org,
	Williams Dan J <dan.j.williams@intel.com>,
	Peng Chao P <chao.p.peng@intel.com>,
	Gao Chao <chao.gao@intel.com>, Xu Yilun <yilun.xu@intel.com>,
	Li Xiaoyao <xiaoyao.li@intel.com>
Subject: Re: [PATCH v2 3/6] memory-attribute-manager: Introduce MemoryAttributeManager to manage RAMBLock with guest_memfd
Date: Wed, 19 Feb 2025 14:49:50 +1100	[thread overview]
Message-ID: <23e2553b-0390-4215-a19d-0422b55efa38@amd.com> (raw)
In-Reply-To: <c5682028-b84c-4b4c-8c4d-f3b43d412e83@intel.com>



On 19/2/25 12:20, Chenyi Qiang wrote:
> 
> 
> On 2/18/2025 5:19 PM, Alexey Kardashevskiy wrote:
>>
>>
> 
> [..]
> 
>>> diff --git a/include/system/memory-attribute-manager.h b/include/
>>> system/memory-attribute-manager.h
>>> new file mode 100644
>>> index 0000000000..72adc0028e
>>> --- /dev/null
>>> +++ b/include/system/memory-attribute-manager.h
>>> @@ -0,0 +1,42 @@
>>> +/*
>>> + * QEMU memory attribute manager
>>> + *
>>> + * Copyright Intel
>>> + *
>>> + * Author:
>>> + *      Chenyi Qiang <chenyi.qiang@intel.com>
>>> + *
>>> + * This work is licensed under the terms of the GNU GPL, version 2 or
>>> later.
>>> + * See the COPYING file in the top-level directory
>>> + *
>>> + */
>>> +
>>> +#ifndef SYSTEM_MEMORY_ATTRIBUTE_MANAGER_H
>>> +#define SYSTEM_MEMORY_ATTRIBUTE_MANAGER_H
>>> +
>>> +#include "system/hostmem.h"
>>> +
>>> +#define TYPE_MEMORY_ATTRIBUTE_MANAGER "memory-attribute-manager"
>>> +
>>> +OBJECT_DECLARE_TYPE(MemoryAttributeManager,
>>> MemoryAttributeManagerClass, MEMORY_ATTRIBUTE_MANAGER)
>>> +
>>> +struct MemoryAttributeManager {
>>> +    Object parent;
>>> +
>>> +    MemoryRegion *mr;
>>> +
>>> +    /* 1-setting of the bit represents the memory is populated
>>> (shared) */
>>> +    int32_t bitmap_size;
>>
>> unsigned.
>>
>> Also, do either s/bitmap_size/shared_bitmap_size/ or
>> s/shared_bitmap/bitmap/
> 
> Will change it. Thanks.
> 
>>
>>
>>
>>> +    unsigned long *shared_bitmap;
>>> +
>>> +    QLIST_HEAD(, RamDiscardListener) rdl_list;
>>> +};
>>> +
>>> +struct MemoryAttributeManagerClass {
>>> +    ObjectClass parent_class;
>>> +};
>>> +
>>> +int memory_attribute_manager_realize(MemoryAttributeManager *mgr,
>>> MemoryRegion *mr);
>>> +void memory_attribute_manager_unrealize(MemoryAttributeManager *mgr);
>>> +
>>> +#endif
>>> diff --git a/system/memory-attribute-manager.c b/system/memory-
>>> attribute-manager.c
>>> new file mode 100644
>>> index 0000000000..ed97e43dd0
>>> --- /dev/null
>>> +++ b/system/memory-attribute-manager.c
>>> @@ -0,0 +1,292 @@
>>> +/*
>>> + * QEMU memory attribute manager
>>> + *
>>> + * Copyright Intel
>>> + *
>>> + * Author:
>>> + *      Chenyi Qiang <chenyi.qiang@intel.com>
>>> + *
>>> + * This work is licensed under the terms of the GNU GPL, version 2 or
>>> later.
>>> + * See the COPYING file in the top-level directory
>>> + *
>>> + */
>>> +
>>> +#include "qemu/osdep.h"
>>> +#include "qemu/error-report.h"
>>> +#include "system/memory-attribute-manager.h"
>>> +
>>> +OBJECT_DEFINE_TYPE_WITH_INTERFACES(MemoryAttributeManager,
>>> +                                   memory_attribute_manager,
>>> +                                   MEMORY_ATTRIBUTE_MANAGER,
>>> +                                   OBJECT,
>>> +                                   { TYPE_RAM_DISCARD_MANAGER },
>>> +                                   { })
>>> +
>>> +static int memory_attribute_manager_get_block_size(const
>>> MemoryAttributeManager *mgr)
>>> +{
>>> +    /*
>>> +     * Because page conversion could be manipulated in the size of at
>>> least 4K or 4K aligned,
>>> +     * Use the host page size as the granularity to track the memory
>>> attribute.
>>> +     * TODO: if necessary, switch to get the page_size from RAMBlock.
>>> +     * i.e. mgr->mr->ram_block->page_size.
>>
>> I'd assume it is rather necessary already.
> 
> OK, Will return the page_size of RAMBlock directly.
> 
>>
>>> +     */
>>> +    return qemu_real_host_page_size();
>>> +}
>>> +
>>> +
>>> +static bool memory_attribute_rdm_is_populated(const RamDiscardManager
>>> *rdm,
>>> +                                              const
>>> MemoryRegionSection *section)
>>> +{
>>> +    const MemoryAttributeManager *mgr = MEMORY_ATTRIBUTE_MANAGER(rdm);
>>> +    int block_size = memory_attribute_manager_get_block_size(mgr);
>>> +    uint64_t first_bit = section->offset_within_region / block_size;
>>> +    uint64_t last_bit = first_bit + int128_get64(section->size) /
>>> block_size - 1;
>>> +    unsigned long first_discard_bit;
>>> +
>>> +    first_discard_bit = find_next_zero_bit(mgr->shared_bitmap,
>>> last_bit + 1, first_bit);
>>> +    return first_discard_bit > last_bit;
>>> +}
>>> +
>>> +typedef int (*memory_attribute_section_cb)(MemoryRegionSection *s,
>>> void *arg);
>>> +
>>> +static int memory_attribute_notify_populate_cb(MemoryRegionSection
>>> *section, void *arg)
>>> +{
>>> +    RamDiscardListener *rdl = arg;
>>> +
>>> +    return rdl->notify_populate(rdl, section);
>>> +}
>>> +
>>> +static int memory_attribute_notify_discard_cb(MemoryRegionSection
>>> *section, void *arg)
>>> +{
>>> +    RamDiscardListener *rdl = arg;
>>> +
>>> +    rdl->notify_discard(rdl, section);
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static int memory_attribute_for_each_populated_section(const
>>> MemoryAttributeManager *mgr,
>>> +
>>> MemoryRegionSection *section,
>>> +                                                       void *arg,
>>> +
>>> memory_attribute_section_cb cb)
>>> +{
>>> +    unsigned long first_one_bit, last_one_bit;
>>> +    uint64_t offset, size;
>>> +    int block_size = memory_attribute_manager_get_block_size(mgr);
>>> +    int ret = 0;
>>> +
>>> +    first_one_bit = section->offset_within_region / block_size;
>>> +    first_one_bit = find_next_bit(mgr->shared_bitmap, mgr-
>>>> bitmap_size, first_one_bit);
>>> +
>>> +    while (first_one_bit < mgr->bitmap_size) {
>>> +        MemoryRegionSection tmp = *section;
>>> +
>>> +        offset = first_one_bit * block_size;
>>> +        last_one_bit = find_next_zero_bit(mgr->shared_bitmap, mgr-
>>>> bitmap_size,
>>> +                                          first_one_bit + 1) - 1;
>>> +        size = (last_one_bit - first_one_bit + 1) * block_size;
>>
>>
>> What all this math is for if we stuck with VFIO doing 1 page at the
>> time? (I think I commented on this)
> 
> Sorry, I missed your previous comment. IMHO, as we track the status in
> bitmap and we want to call the cb() on the shared part within
> MemoryRegionSection. Here we do the calculation to find the expected
> sub-range.


You find a largest intersection here and call cb() on it which will call 
VFIO with 1 page at the time. So you could just call cb() for every page 
from here which will make the code simpler.


>>
>>> +
>>> +        if (!memory_region_section_intersect_range(&tmp, offset,
>>> size)) {
>>> +            break;
>>> +        }
>>> +
>>> +        ret = cb(&tmp, arg);
>>> +        if (ret) {
>>> +            error_report("%s: Failed to notify RAM discard listener:
>>> %s", __func__,
>>> +                         strerror(-ret));
>>> +            break;
>>> +        }
>>> +
>>> +        first_one_bit = find_next_bit(mgr->shared_bitmap, mgr-
>>>> bitmap_size,
>>> +                                      last_one_bit + 2);
>>> +    }
>>> +
>>> +    return ret;
>>> +}
>>> +
> 
> [..]
> 
>>> +
>>> +static void
>>> memory_attribute_rdm_unregister_listener(RamDiscardManager *rdm,
>>> +
>>> RamDiscardListener *rdl)
>>> +{
>>> +    MemoryAttributeManager *mgr = MEMORY_ATTRIBUTE_MANAGER(rdm);
>>> +    int ret;
>>> +
>>> +    g_assert(rdl->section);
>>> +    g_assert(rdl->section->mr == mgr->mr);
>>> +
>>> +    ret = memory_attribute_for_each_populated_section(mgr, rdl-
>>>> section, rdl,
>>> +
>>> memory_attribute_notify_discard_cb);
>>> +    if (ret) {
>>> +        error_report("%s: Failed to unregister RAM discard listener:
>>> %s", __func__,
>>> +                     strerror(-ret));
>>> +    }
>>> +
>>> +    memory_region_section_free_copy(rdl->section);
>>> +    rdl->section = NULL;
>>> +    QLIST_REMOVE(rdl, next);
>>> +
>>> +}
>>> +
>>> +typedef struct MemoryAttributeReplayData {
>>> +    void *fn;
>>
>> ReplayRamDiscard *fn, not void*.
> 
> We could cast the void *fn either to ReplayRamPopulate or
> ReplayRamDiscard (see below).


Hard to read, hard to maintain, and they take same parameters, only the 
return value is different (int/void) - if this is really important, have 
2 fn pointers in MemoryAttributeReplayData. It is already hard to follow 
this train on callbacks.


>>> +    void *opaque;
>>> +} MemoryAttributeReplayData;
>>> +
>>> +static int
>>> memory_attribute_rdm_replay_populated_cb(MemoryRegionSection *section,
>>> void *arg)
>>> +{
>>> +    MemoryAttributeReplayData *data = arg;
>>> +
>>> +    return ((ReplayRamPopulate)data->fn)(section, data->opaque);
>>> +}
>>> +
>>> +static int memory_attribute_rdm_replay_populated(const
>>> RamDiscardManager *rdm,
>>> +                                                 MemoryRegionSection
>>> *section,
>>> +                                                 ReplayRamPopulate
>>> replay_fn,
>>> +                                                 void *opaque)
>>> +{
>>> +    MemoryAttributeManager *mgr = MEMORY_ATTRIBUTE_MANAGER(rdm);
>>> +    MemoryAttributeReplayData data = { .fn = replay_fn, .opaque =
>>> opaque };
>>> +
>>> +    g_assert(section->mr == mgr->mr);
>>> +    return memory_attribute_for_each_populated_section(mgr, section,
>>> &data,
>>> +
>>> memory_attribute_rdm_replay_populated_cb);
>>> +}
>>> +
>>> +static int
>>> memory_attribute_rdm_replay_discarded_cb(MemoryRegionSection *section,
>>> void *arg)
>>> +{
>>> +    MemoryAttributeReplayData *data = arg;
>>> +
>>> +    ((ReplayRamDiscard)data->fn)(section, data->opaque);
>>> +    return 0;
>>> +}
>>> +
>>> +static void memory_attribute_rdm_replay_discarded(const
>>> RamDiscardManager *rdm,
>>> +                                                  MemoryRegionSection
>>> *section,
>>> +                                                  ReplayRamDiscard
>>> replay_fn,
>>> +                                                  void *opaque)
>>> +{
>>> +    MemoryAttributeManager *mgr = MEMORY_ATTRIBUTE_MANAGER(rdm);
>>> +    MemoryAttributeReplayData data = { .fn = replay_fn, .opaque =
>>> opaque };
>>> +
>>> +    g_assert(section->mr == mgr->mr);
>>> +    memory_attribute_for_each_discarded_section(mgr, section, &data,
>>> +
>>> memory_attribute_rdm_replay_discarded_cb);
>>> +}
>>> +
>>> +int memory_attribute_manager_realize(MemoryAttributeManager *mgr,
>>> MemoryRegion *mr)
>>> +{
>>> +    uint64_t bitmap_size;
>>> +    int block_size = memory_attribute_manager_get_block_size(mgr);
>>> +    int ret;
>>> +
>>> +    bitmap_size = ROUND_UP(mr->size, block_size) / block_size;
>>> +
>>> +    mgr->mr = mr;
>>> +    mgr->bitmap_size = bitmap_size;
>>> +    mgr->shared_bitmap = bitmap_new(bitmap_size);
>>> +
>>> +    ret = memory_region_set_ram_discard_manager(mgr->mr,
>>> RAM_DISCARD_MANAGER(mgr));
>>
>> Move it 3 lines up and avoid stale data in mgr->mr/bitmap_size/
>> shared_bitmap and avoid g_free below?
> 
> Make sense. I will move it up the same as patch 02 before bitmap_new().
> 
>>
>>> +    if (ret) {
>>> +        g_free(mgr->shared_bitmap);
>>> +    }
>>> +
>>> +    return ret;
>>> +}
>>> +
>>> +void memory_attribute_manager_unrealize(MemoryAttributeManager *mgr)
>>> +{
>>> +    memory_region_set_ram_discard_manager(mgr->mr, NULL);
>>> +
>>> +    g_free(mgr->shared_bitmap);
>>> +}
>>> +
>>> +static void memory_attribute_manager_init(Object *obj)
>>
>> Not used.
>>
>>> +{
>>> +    MemoryAttributeManager *mgr = MEMORY_ATTRIBUTE_MANAGER(obj);
>>> +
>>> +    QLIST_INIT(&mgr->rdl_list);
>>> +} > +
>>> +static void memory_attribute_manager_finalize(Object *obj)
>>
>> Not used either. Thanks,
> 
> I think it is OK to define it as a placeholder? Just some preference.

At very least gcc should warn on these (I am surprised it did not) and 
nobody likes this. Thanks,


>>
>>> +{
>>> +}
>>> +
>>> +static void memory_attribute_manager_class_init(ObjectClass *oc, void
>>> *data)
>>> +{
>>> +    RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_CLASS(oc);
>>> +
>>> +    rdmc->get_min_granularity =
>>> memory_attribute_rdm_get_min_granularity;
>>> +    rdmc->register_listener = memory_attribute_rdm_register_listener;
>>> +    rdmc->unregister_listener =
>>> memory_attribute_rdm_unregister_listener;
>>> +    rdmc->is_populated = memory_attribute_rdm_is_populated;
>>> +    rdmc->replay_populated = memory_attribute_rdm_replay_populated;
>>> +    rdmc->replay_discarded = memory_attribute_rdm_replay_discarded;
>>> +}
>>> diff --git a/system/meson.build b/system/meson.build
>>> index 4952f4b2c7..ab07ff1442 100644
>>> --- a/system/meson.build
>>> +++ b/system/meson.build
>>> @@ -15,6 +15,7 @@ system_ss.add(files(
>>>      'dirtylimit.c',
>>>      'dma-helpers.c',
>>>      'globals.c',
>>> +  'memory-attribute-manager.c',
>>>      'memory_mapping.c',
>>>      'qdev-monitor.c',
>>>      'qtest.c',
>>
> 

-- 
Alexey



  reply	other threads:[~2025-02-19  3:51 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-02-17  8:18 [PATCH v2 0/6] Enable shared device assignment Chenyi Qiang
2025-02-17  8:18 ` [PATCH v2 1/6] memory: Export a helper to get intersection of a MemoryRegionSection with a given range Chenyi Qiang
2025-02-17  8:18 ` [PATCH v2 2/6] memory: Change memory_region_set_ram_discard_manager() to return the result Chenyi Qiang
2025-02-18  9:19   ` Alexey Kardashevskiy
2025-02-18  9:41     ` Chenyi Qiang
2025-02-18 10:46     ` David Hildenbrand
2025-02-17  8:18 ` [PATCH v2 3/6] memory-attribute-manager: Introduce MemoryAttributeManager to manage RAMBLock with guest_memfd Chenyi Qiang
2025-02-18  9:19   ` Alexey Kardashevskiy
2025-02-19  1:20     ` Chenyi Qiang
2025-02-19  3:49       ` Alexey Kardashevskiy [this message]
2025-02-19  6:33         ` Chenyi Qiang
2025-02-20  3:02           ` Alexey Kardashevskiy
2025-02-17  8:18 ` [PATCH v2 4/6] memory-attribute-manager: Introduce a callback to notify the shared/private state change Chenyi Qiang
2025-02-18  9:19   ` Alexey Kardashevskiy
2025-02-19  1:50     ` Chenyi Qiang
2025-02-17  8:18 ` [PATCH v2 5/6] memory: Attach MemoryAttributeManager to guest_memfd-backed RAMBlocks Chenyi Qiang
2025-02-18  9:19   ` Alexey Kardashevskiy
2025-02-17  8:18 ` [PATCH v2 6/6] RAMBlock: Make guest_memfd require coordinate discard Chenyi Qiang

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=23e2553b-0390-4215-a19d-0422b55efa38@amd.com \
    --to=aik@amd.com \
    --cc=chao.gao@intel.com \
    --cc=chao.p.peng@intel.com \
    --cc=chenyi.qiang@intel.com \
    --cc=dan.j.williams@intel.com \
    --cc=david@redhat.com \
    --cc=kvm@vger.kernel.org \
    --cc=michael.roth@amd.com \
    --cc=pbonzini@redhat.com \
    --cc=peterx@redhat.com \
    --cc=philmd@linaro.org \
    --cc=qemu-devel@nongnu.org \
    --cc=xiaoyao.li@intel.com \
    --cc=yilun.xu@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).