From: Paolo Bonzini <pbonzini@redhat.com>
To: Liu Ping Fan <qemulist@gmail.com>
Cc: Peter Maydell <peter.maydell@linaro.org>,
Anthony Liguori <anthony@codemonkey.ws>,
"Michael S. Tsirkin" <mst@redhat.com>,
Jan Kiszka <jan.kiszka@siemens.com>,
qemu-devel@nongnu.org, Stefan Hajnoczi <stefanha@redhat.com>
Subject: Re: [Qemu-devel] [RFC PATCH 1/2] mem: make phys_section and phys_map_nodes prepared for RCU
Date: Mon, 13 May 2013 11:20:59 +0200 [thread overview]
Message-ID: <5190B07B.3040605@redhat.com> (raw)
In-Reply-To: <1368415264-10800-2-git-send-email-qemulist@gmail.com>
Il 13/05/2013 05:21, Liu Ping Fan ha scritto:
> From: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
>
> Now, each AddressSpaceDispatch has its own radix-tree, and all of them
> lie on phys_section[] and phys_map_nodes[]. When we want lockless
> mmio dispatch, we need something like RCU.
>
> Acheive this with PhysPageTable which contains all of the info for all
> radix trees. After all address space listeners update (ie. excluding the
> readers) we switch from PhysPageTable *cur_pgtbl to *next_pgtbl.
> (The real RCU style is adopt by listener, see next patch)
>
> Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
> ---
> exec.c | 197 +++++++++++++++++++++++++------------------------
> include/exec/memory.h | 2 +
> memory.c | 2 +
> 3 files changed, 106 insertions(+), 95 deletions(-)
>
> diff --git a/exec.c b/exec.c
> index c5f8082..bb4e540 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -80,23 +80,34 @@ int use_icount;
> #if !defined(CONFIG_USER_ONLY)
>
> #define SUBSECTION_IDX(addr) ((addr) & ~TARGET_PAGE_MASK)
> -#define PHYS_SECTION_ID(psection) ((psection) - phys_sections)
> +#define PHYS_SECTION_ID(psection, base) ((psection) - base->phys_sections)
>
> typedef struct PhysSection {
> MemoryRegionSection section;
> uint16_t *sub_section;
> } PhysSection;
>
> -static PhysSection *phys_sections;
> -static unsigned phys_sections_nb, phys_sections_nb_alloc;
> -static uint16_t phys_section_unassigned;
> -static uint16_t phys_section_notdirty;
> -static uint16_t phys_section_rom;
> -static uint16_t phys_section_watch;
> +typedef PhysPageEntry Node[L2_SIZE];
>
> -/* Simple allocator for PhysPageEntry nodes */
> -static PhysPageEntry (*phys_map_nodes)[L2_SIZE];
> -static unsigned phys_map_nodes_nb, phys_map_nodes_nb_alloc;
> +typedef struct PhysPageTable PhysPageTable;
> +
> +struct PhysPageTable {
> + int ref;
> + PhysSection *phys_sections;
> + unsigned phys_sections_nb;
> + unsigned phys_sections_nb_alloc;
> + uint16_t phys_section_unassigned;
> + uint16_t phys_section_notdirty;
> + uint16_t phys_section_rom;
> + uint16_t phys_section_watch;
These four should be constants. Please make them #defines and add
assertions that they have the expected values (in a separate patch).
> +
> + Node *phys_map_nodes;
> + unsigned phys_map_nodes_nb;
> + unsigned phys_map_nodes_nb_alloc;
> +};
> +static PhysPageTable *cur_pgtbl;
> +static PhysPageTable *next_pgtbl;
You shouldn't need cur_pgtbl. Instead, each AddressSpaceDispatch should
have a pointer to its own cur_pgtbl. In the commit hook you can then
take a lock, unref the old table, assign cur_pgtbl = next_pgtbl and ref
the new one.
The lock must also be taken in address_space_translate, together with a
relatively expensive ref/unref pair (two atomic operations). We
probably need real RCU so that we can skip the ref/unref.
>
> #define PHYS_MAP_NODE_NIL (((uint16_t)~0) >> 1)
>
> @@ -111,13 +122,13 @@ static MemoryRegion io_mem_watch;
>
> static void phys_map_node_reserve(unsigned nodes)
> {
> - if (phys_map_nodes_nb + nodes > phys_map_nodes_nb_alloc) {
> + if (next_pgtbl->phys_map_nodes_nb + nodes > next_pgtbl->phys_map_nodes_nb_alloc) {
> typedef PhysPageEntry Node[L2_SIZE];
> - phys_map_nodes_nb_alloc = MAX(phys_map_nodes_nb_alloc * 2, 16);
> - phys_map_nodes_nb_alloc = MAX(phys_map_nodes_nb_alloc,
> - phys_map_nodes_nb + nodes);
> - phys_map_nodes = g_renew(Node, phys_map_nodes,
> - phys_map_nodes_nb_alloc);
> + next_pgtbl->phys_map_nodes_nb_alloc = MAX(next_pgtbl->phys_map_nodes_nb_alloc * 2, 16);
> + next_pgtbl->phys_map_nodes_nb_alloc = MAX(next_pgtbl->phys_map_nodes_nb_alloc,
> + next_pgtbl->phys_map_nodes_nb + nodes);
> + next_pgtbl->phys_map_nodes = g_renew(Node, next_pgtbl->phys_map_nodes,
> + next_pgtbl->phys_map_nodes_nb_alloc);
> }
> }
>
> @@ -126,22 +137,16 @@ static uint16_t phys_map_node_alloc(void)
> unsigned i;
> uint16_t ret;
>
> - ret = phys_map_nodes_nb++;
> + ret = next_pgtbl->phys_map_nodes_nb++;
> assert(ret != PHYS_MAP_NODE_NIL);
> - assert(ret != phys_map_nodes_nb_alloc);
> + assert(ret != next_pgtbl->phys_map_nodes_nb_alloc);
> for (i = 0; i < L2_SIZE; ++i) {
> - phys_map_nodes[ret][i].is_leaf = 0;
> - phys_map_nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
> + next_pgtbl->phys_map_nodes[ret][i].is_leaf = 0;
> + next_pgtbl->phys_map_nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
> }
> return ret;
> }
>
> -static void phys_map_nodes_reset(void)
> -{
> - phys_map_nodes_nb = 0;
> -}
> -
> -
> static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
> hwaddr *nb, uint16_t leaf,
> int level)
> @@ -152,15 +157,15 @@ static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
>
> if (!lp->is_leaf && lp->ptr == PHYS_MAP_NODE_NIL) {
> lp->ptr = phys_map_node_alloc();
> - p = phys_map_nodes[lp->ptr];
> + p = next_pgtbl->phys_map_nodes[lp->ptr];
> if (level == 0) {
> for (i = 0; i < L2_SIZE; i++) {
> p[i].is_leaf = 1;
> - p[i].ptr = phys_section_unassigned;
> + p[i].ptr = next_pgtbl->phys_section_unassigned;
> }
> }
> } else {
> - p = phys_map_nodes[lp->ptr];
> + p = next_pgtbl->phys_map_nodes[lp->ptr];
> }
> lp = &p[(*index >> (level * L2_BITS)) & (L2_SIZE - 1)];
>
> @@ -192,11 +197,13 @@ static PhysSection *phys_section_find(AddressSpaceDispatch *d,
> {
> PhysPageEntry lp = d->phys_map;
> PhysPageEntry *p;
> + PhysSection *phys_sections = cur_pgtbl->phys_sections;
> + Node *phys_map_nodes = cur_pgtbl->phys_map_nodes;
> int i;
>
> for (i = P_L2_LEVELS - 1; i >= 0 && !lp.is_leaf; i--) {
> if (lp.ptr == PHYS_MAP_NODE_NIL) {
> - return &phys_sections[phys_section_unassigned];
> + return &phys_sections[cur_pgtbl->phys_section_unassigned];
> }
> p = phys_map_nodes[lp.ptr];
> lp = p[(index >> (i * L2_BITS)) & (L2_SIZE - 1)];
> @@ -209,6 +216,7 @@ static MemoryRegionSection *address_space_lookup_region(AddressSpace *as,
> {
> PhysSection *psection;
> uint16_t idx;
> + PhysSection *phys_sections = cur_pgtbl->phys_sections;
>
> psection = phys_section_find(as->dispatch, addr >> TARGET_PAGE_BITS);
> if (psection->sub_section) {
> @@ -246,7 +254,7 @@ MemoryRegionSection *address_space_translate(AddressSpace *as, hwaddr addr,
> | (addr & iotlb.addr_mask));
> len = MIN(len, (addr | iotlb.addr_mask) - addr + 1);
> if (!iotlb.perm[is_write]) {
> - section = &phys_sections[phys_section_unassigned].section;
> + section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_unassigned].section;
> break;
> }
>
> @@ -690,12 +698,12 @@ hwaddr memory_region_section_get_iotlb(CPUArchState *env,
> iotlb = (memory_region_get_ram_addr(section->mr) & TARGET_PAGE_MASK)
> + xlat;
> if (!section->readonly) {
> - iotlb |= phys_section_notdirty;
> + iotlb |= cur_pgtbl->phys_section_notdirty;
> } else {
> - iotlb |= phys_section_rom;
> + iotlb |= cur_pgtbl->phys_section_rom;
> }
> } else {
> - iotlb = PHYS_SECTION_ID(container_of(section, PhysSection, section));
> + iotlb = PHYS_SECTION_ID(container_of(section, PhysSection, section), cur_pgtbl);
> iotlb += xlat;
> }
>
> @@ -705,7 +713,7 @@ hwaddr memory_region_section_get_iotlb(CPUArchState *env,
> if (vaddr == (wp->vaddr & TARGET_PAGE_MASK)) {
> /* Avoid trapping reads of pages with a write breakpoint. */
> if ((prot & PAGE_WRITE) || (wp->flags & BP_MEM_READ)) {
> - iotlb = phys_section_watch + paddr;
> + iotlb = cur_pgtbl->phys_section_watch + paddr;
> *address |= TLB_MMIO;
> break;
> }
> @@ -721,59 +729,40 @@ static int subsection_register(PhysSection *psection, uint32_t start,
> uint32_t end, uint16_t section);
> static void subsections_init(PhysSection *psection);
>
> -static void destroy_page_desc(uint16_t section_index)
> +/* Call after all listener has been commit.
> + * we do not walk over tree, just simply drop.
> + */
> +static void destroy_pagetable(PhysPageTable *pgtbl)
> {
> - g_free(phys_sections[section_index].sub_section);
> -}
> -
> -static void destroy_l2_mapping(PhysPageEntry *lp, unsigned level)
> -{
> - unsigned i;
> - PhysPageEntry *p;
> + int i;
>
> - if (lp->ptr == PHYS_MAP_NODE_NIL) {
> - return;
> - }
> + g_free(pgtbl->phys_map_nodes);
>
> - p = phys_map_nodes[lp->ptr];
> - for (i = 0; i < L2_SIZE; ++i) {
> - if (!p[i].is_leaf) {
> - destroy_l2_mapping(&p[i], level - 1);
> + for (i = 0; i < pgtbl->phys_sections_nb_alloc; i++) {
> + if (pgtbl->phys_sections[i].sub_section) {
> + g_free(pgtbl->phys_sections[i].sub_section);
> } else {
> - destroy_page_desc(p[i].ptr);
> + memory_region_unref(pgtbl->phys_sections[i].section.mr);
> }
> }
> - lp->is_leaf = 0;
> - lp->ptr = PHYS_MAP_NODE_NIL;
> -}
> + g_free(pgtbl->phys_sections);
>
> -static void destroy_all_mappings(AddressSpaceDispatch *d)
> -{
> - destroy_l2_mapping(&d->phys_map, P_L2_LEVELS - 1);
> - phys_map_nodes_reset();
> + g_free(pgtbl);
> }
This should be a separate patch.
> -static uint16_t phys_section_add(MemoryRegionSection *section)
> +static uint16_t phys_section_add(MemoryRegionSection *section, PhysPageTable *pgtbl)
phys_section_add will always add to next_pgtbl.
> {
> - assert(phys_sections_nb < TARGET_PAGE_SIZE);
> + assert(pgtbl->phys_sections_nb < TARGET_PAGE_SIZE);
>
> - if (phys_sections_nb == phys_sections_nb_alloc) {
> - phys_sections_nb_alloc = MAX(phys_sections_nb_alloc * 2, 16);
> - phys_sections = g_renew(PhysSection, phys_sections,
> - phys_sections_nb_alloc);
> + if (pgtbl->phys_sections_nb == pgtbl->phys_sections_nb_alloc) {
> + pgtbl->phys_sections_nb_alloc = MAX(pgtbl->phys_sections_nb_alloc * 2, 16);
> + pgtbl->phys_sections = g_renew(PhysSection, pgtbl->phys_sections,
> + pgtbl->phys_sections_nb_alloc);
> }
> - phys_sections[phys_sections_nb].section = *section;
> - phys_sections[phys_sections_nb].sub_section = NULL;
> + pgtbl->phys_sections[pgtbl->phys_sections_nb].section = *section;
> + pgtbl->phys_sections[pgtbl->phys_sections_nb].sub_section = NULL;
> memory_region_ref(section->mr);
> - return phys_sections_nb++;
> -}
> -
> -static void phys_sections_clear(void)
> -{
> - while (phys_sections_nb > 0) {
> - PhysSection *phys_section = &phys_sections[--phys_sections_nb];
> - memory_region_unref(phys_section->section.mr);
> - }
> + return pgtbl->phys_sections_nb++;
> }
>
> static void register_subsection(AddressSpaceDispatch *d,
> @@ -793,18 +782,18 @@ static void register_subsection(AddressSpaceDispatch *d,
> psection->section.mr == &io_mem_unassigned);
>
> if (!psection->sub_section) {
> - new_section = phys_section_add(&subsection);
> - psection = &phys_sections[new_section];
> + new_section = phys_section_add(&subsection, next_pgtbl);
> + psection = &next_pgtbl->phys_sections[new_section];
> subsections_init(psection);
> phys_page_set(d, base >> TARGET_PAGE_BITS, 1, new_section);
> } else {
> - new_section = PHYS_SECTION_ID(psection);
> + new_section = PHYS_SECTION_ID(psection, next_pgtbl);
> }
>
> - new_subsection = phys_section_add(section);
> + new_subsection = phys_section_add(section, next_pgtbl);
>
> /* phys_section_add invalidates psection, reload it */
> - psection = &phys_sections[new_section];
> + psection = &next_pgtbl->phys_sections[new_section];
> start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
> end = start + section->size - 1;
> subsection_register(psection, start, end, new_subsection);
> @@ -816,7 +805,7 @@ static void register_multipage(AddressSpaceDispatch *d, MemoryRegionSection *sec
> hwaddr start_addr = section->offset_within_address_space;
> ram_addr_t size = section->size;
> hwaddr addr;
> - uint16_t section_index = phys_section_add(section);
> + uint16_t section_index = phys_section_add(section, next_pgtbl);
>
> assert(size);
>
> @@ -1653,7 +1642,7 @@ static void subsections_init(PhysSection *psection)
> {
> psection->sub_section = g_malloc0(sizeof(uint16_t) * TARGET_PAGE_SIZE);
> subsection_register(psection, 0, TARGET_PAGE_SIZE-1,
> - phys_section_unassigned);
> + next_pgtbl->phys_section_unassigned);
> }
>
> static uint16_t dummy_section(MemoryRegion *mr)
> @@ -1665,12 +1654,12 @@ static uint16_t dummy_section(MemoryRegion *mr)
> .size = UINT64_MAX,
> };
>
> - return phys_section_add(§ion);
> + return phys_section_add(§ion, next_pgtbl);
> }
>
> MemoryRegion *iotlb_to_region(hwaddr index)
> {
> - return phys_sections[index & ~TARGET_PAGE_MASK].section.mr;
> + return cur_pgtbl->phys_sections[index & ~TARGET_PAGE_MASK].section.mr;
> }
>
> static void io_mem_init(void)
> @@ -1685,21 +1674,40 @@ static void io_mem_init(void)
> "watch", UINT64_MAX);
> }
>
> +void global_begin(void)
> +{
> + next_pgtbl = g_new0(PhysPageTable, 1);
> + next_pgtbl->ref = 1;
> + next_pgtbl->phys_section_unassigned = dummy_section(&io_mem_unassigned);
> + next_pgtbl->phys_section_notdirty = dummy_section(&io_mem_notdirty);
> + next_pgtbl->phys_section_rom = dummy_section(&io_mem_rom);
> + next_pgtbl->phys_section_watch = dummy_section(&io_mem_watch);
> +}
> +
> +/* other listeners finished */
> +void global_commit(void)
> +{
> + PhysPageTable *t = cur_pgtbl;
> +
> + cur_pgtbl = next_pgtbl;
> + /* Fix me, currently, we rely on each address space listener agaist its
> + * reader. So when we come here, no readers will touch old phys_map_node.
> + * After rcu, should changed to call_rcu()
> + */
> + if (__sync_sub_and_fetch(&t->ref, 1) == 0) {
> + destroy_pagetable(t);
> + }
global_commit should not be needed, and global_begin should be simply
the new core_begin. It should unref next_pgtbl and reallocate a new one.
> +}
> +
> static void mem_begin(MemoryListener *listener)
> {
> AddressSpaceDispatch *d = container_of(listener, AddressSpaceDispatch, listener);
>
> - destroy_all_mappings(d);
> d->phys_map.ptr = PHYS_MAP_NODE_NIL;
> }
>
> static void core_begin(MemoryListener *listener)
> {
> - phys_sections_clear();
> - phys_section_unassigned = dummy_section(&io_mem_unassigned);
> - phys_section_notdirty = dummy_section(&io_mem_notdirty);
> - phys_section_rom = dummy_section(&io_mem_rom);
> - phys_section_watch = dummy_section(&io_mem_watch);
> }
>
> static void tcg_commit(MemoryListener *listener)
> @@ -1779,7 +1787,6 @@ void address_space_destroy_dispatch(AddressSpace *as)
> AddressSpaceDispatch *d = as->dispatch;
>
> memory_listener_unregister(&d->listener);
> - destroy_l2_mapping(&d->phys_map, P_L2_LEVELS - 1);
> g_free(d);
> as->dispatch = NULL;
> }
> @@ -2386,7 +2393,7 @@ void stl_phys_notdirty(hwaddr addr, uint32_t val)
>
> if (!memory_region_is_ram(section->mr) || section->readonly) {
> if (memory_region_is_ram(section->mr)) {
> - section = &phys_sections[phys_section_rom].section;
> + section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
> }
> io_mem_write(section->mr, addr, val, 4);
> } else {
> @@ -2422,7 +2429,7 @@ void stq_phys_notdirty(hwaddr addr, uint64_t val)
>
> if (!memory_region_is_ram(section->mr) || section->readonly) {
> if (memory_region_is_ram(section->mr)) {
> - section = &phys_sections[phys_section_rom].section;
> + section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
> }
> #ifdef TARGET_WORDS_BIGENDIAN
> io_mem_write(section->mr, addr, val >> 32, 4);
> @@ -2455,7 +2462,7 @@ static inline void stl_phys_internal(hwaddr addr, uint32_t val,
>
> if (!memory_region_is_ram(section->mr) || section->readonly) {
> if (memory_region_is_ram(section->mr)) {
> - section = &phys_sections[phys_section_rom].section;
> + section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
> }
> #if defined(TARGET_WORDS_BIGENDIAN)
> if (endian == DEVICE_LITTLE_ENDIAN) {
> @@ -2526,7 +2533,7 @@ static inline void stw_phys_internal(hwaddr addr, uint32_t val,
>
> if (!memory_region_is_ram(section->mr) || section->readonly) {
> if (memory_region_is_ram(section->mr)) {
> - section = &phys_sections[phys_section_rom].section;
> + section = &cur_pgtbl->phys_sections[cur_pgtbl->phys_section_rom].section;
> }
> #if defined(TARGET_WORDS_BIGENDIAN)
> if (endian == DEVICE_LITTLE_ENDIAN) {
> diff --git a/include/exec/memory.h b/include/exec/memory.h
> index b97ace7..cc654fa 100644
> --- a/include/exec/memory.h
> +++ b/include/exec/memory.h
> @@ -992,6 +992,8 @@ void *address_space_map(AddressSpace *as, hwaddr addr,
> void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
> int is_write, hwaddr access_len);
>
> +void global_begin(void);
> +void global_commit(void);
>
> #endif
>
> diff --git a/memory.c b/memory.c
> index 1a86607..da06dfd 100644
> --- a/memory.c
> +++ b/memory.c
> @@ -805,6 +805,7 @@ void memory_region_transaction_commit(void)
> --memory_region_transaction_depth;
> if (!memory_region_transaction_depth && memory_region_update_pending) {
> memory_region_update_pending = false;
> + global_begin();
> MEMORY_LISTENER_CALL_GLOBAL(begin, Forward);
>
> QTAILQ_FOREACH(as, &address_spaces, address_spaces_link) {
> @@ -812,6 +813,7 @@ void memory_region_transaction_commit(void)
> }
>
> MEMORY_LISTENER_CALL_GLOBAL(commit, Forward);
> + global_commit();
> }
> }
>
>
next prev parent reply other threads:[~2013-05-13 9:21 UTC|newest]
Thread overview: 18+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-05-13 3:21 [Qemu-devel] [RFC PATCH 0/2] make memory listener prepared for rcu style Liu Ping Fan
2013-05-13 3:21 ` [Qemu-devel] [RFC PATCH 1/2] mem: make phys_section and phys_map_nodes prepared for RCU Liu Ping Fan
2013-05-13 9:20 ` Paolo Bonzini [this message]
2013-05-14 3:38 ` liu ping fan
2013-05-14 9:27 ` Paolo Bonzini
2013-05-15 7:04 ` liu ping fan
2013-05-26 13:02 ` liu ping fan
2013-05-27 11:54 ` Paolo Bonzini
2013-05-29 1:52 ` liu ping fan
2013-05-13 3:21 ` [Qemu-devel] [RFC PATCH 2/2] mem: prepare address_space listener rcu style Liu Ping Fan
2013-05-13 9:31 ` Paolo Bonzini
2013-05-14 5:47 ` liu ping fan
2013-05-14 9:34 ` Paolo Bonzini
2013-05-15 1:29 ` liu ping fan
2013-05-15 8:22 ` Paolo Bonzini
2013-05-15 9:11 ` liu ping fan
2013-05-15 9:19 ` Paolo Bonzini
2013-05-16 9:09 ` Paolo Bonzini
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=5190B07B.3040605@redhat.com \
--to=pbonzini@redhat.com \
--cc=anthony@codemonkey.ws \
--cc=jan.kiszka@siemens.com \
--cc=mst@redhat.com \
--cc=peter.maydell@linaro.org \
--cc=qemu-devel@nongnu.org \
--cc=qemulist@gmail.com \
--cc=stefanha@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.