From: "Michael S. Tsirkin" <mst@redhat.com>
To: Marcel Apfelbaum <marcel.a@redhat.com>
Cc: peter.maydell@linaro.org, jan.kiszka@siemens.com,
qemu-devel@nongnu.org, qemu-stable@nongnu.org,
pbonzini@redhat.com, afaerber@suse.de, rth@twiddle.net
Subject: Re: [Qemu-devel] [PATCH] exec: separate sections and nodes per address space
Date: Mon, 2 Dec 2013 15:31:46 +0200
Message-ID: <20131202133146.GA18457@redhat.com>
In-Reply-To: <1385899343-24169-1-git-send-email-marcel.a@redhat.com>
On Sun, Dec 01, 2013 at 02:02:23PM +0200, Marcel Apfelbaum wrote:
> Every address space has its own nodes and sections, but
> they all use the same global arrays of nodes/sections.
>
> This limits the number of devices that can be attached
> to the guest to about 20-30. This happens because:
> - The sections array is limited to 2^12 entries.
> - The main memory has at least 100 sections.
> - Each device address space is actually an alias to
> main memory, multiplying its number of nodes/sections.
>
> Remove the limitation by using separate arrays of
> nodes and sections for each address space.
>
> Signed-off-by: Marcel Apfelbaum <marcel.a@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
I think this is actually needed for stable branches:
this failure to support a large number of devices is
a regression since 1.3.0.
Cc: qemu-stable@nongnu.org
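To make the arithmetic behind the 20-30 figure concrete (this is my own
back-of-the-envelope estimate, not numbers taken from the patch): with a
single global sections array capped at 2^12 entries and each aliased
address space contributing on the order of 100 sections, the shared array
overflows after a few dozen address spaces, i.e. a few dozen devices.
A minimal standalone sketch, assuming only those two constants:

    /* Hypothetical illustration only: the constants mirror the commit
     * message, they are not actual exec.c identifiers. */
    #include <stdio.h>

    int main(void)
    {
        const unsigned global_section_slots = 1u << 12; /* shared sections array cap */
        const unsigned sections_per_as      = 100;      /* ~sections per aliased AS  */

        /* every device address space aliases main memory, so each one
         * consumes another ~100 slots out of the single global array */
        printf("address spaces before overflow: ~%u\n",
               global_section_slots / sections_per_as);  /* prints ~40 */
        return 0;
    }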
> ---
> exec.c | 151 ++++++++++++++++++++++++++++-------------------------------------
> 1 file changed, 64 insertions(+), 87 deletions(-)
>
> diff --git a/exec.c b/exec.c
> index 95c4356..db2844c 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -90,13 +90,21 @@ struct PhysPageEntry {
>
> typedef PhysPageEntry Node[L2_SIZE];
>
> +typedef struct PhysPageMap {
> + unsigned sections_nb;
> + unsigned sections_nb_alloc;
> + unsigned nodes_nb;
> + unsigned nodes_nb_alloc;
> + Node *nodes;
> + MemoryRegionSection *sections;
> +} PhysPageMap;
> +
> struct AddressSpaceDispatch {
> /* This is a multi-level map on the physical address space.
> * The bottom level has pointers to MemoryRegionSections.
> */
> PhysPageEntry phys_map;
> - Node *nodes;
> - MemoryRegionSection *sections;
> + PhysPageMap map;
> AddressSpace *as;
> };
>
> @@ -113,18 +121,6 @@ typedef struct subpage_t {
> #define PHYS_SECTION_ROM 2
> #define PHYS_SECTION_WATCH 3
>
> -typedef struct PhysPageMap {
> - unsigned sections_nb;
> - unsigned sections_nb_alloc;
> - unsigned nodes_nb;
> - unsigned nodes_nb_alloc;
> - Node *nodes;
> - MemoryRegionSection *sections;
> -} PhysPageMap;
> -
> -static PhysPageMap *prev_map;
> -static PhysPageMap next_map;
> -
> #define PHYS_MAP_NODE_NIL (((uint16_t)~0) >> 1)
>
> static void io_mem_init(void);
> @@ -135,35 +131,32 @@ static MemoryRegion io_mem_watch;
>
> #if !defined(CONFIG_USER_ONLY)
>
> -static void phys_map_node_reserve(unsigned nodes)
> +static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
> {
> - if (next_map.nodes_nb + nodes > next_map.nodes_nb_alloc) {
> - next_map.nodes_nb_alloc = MAX(next_map.nodes_nb_alloc * 2,
> - 16);
> - next_map.nodes_nb_alloc = MAX(next_map.nodes_nb_alloc,
> - next_map.nodes_nb + nodes);
> - next_map.nodes = g_renew(Node, next_map.nodes,
> - next_map.nodes_nb_alloc);
> + if (map->nodes_nb + nodes > map->nodes_nb_alloc) {
> + map->nodes_nb_alloc = MAX(map->nodes_nb_alloc * 2, 16);
> + map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, map->nodes_nb + nodes);
> + map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc);
> }
> }
>
> -static uint16_t phys_map_node_alloc(void)
> +static uint16_t phys_map_node_alloc(PhysPageMap *map)
> {
> unsigned i;
> uint16_t ret;
>
> - ret = next_map.nodes_nb++;
> + ret = map->nodes_nb++;
> assert(ret != PHYS_MAP_NODE_NIL);
> - assert(ret != next_map.nodes_nb_alloc);
> + assert(ret != map->nodes_nb_alloc);
> for (i = 0; i < L2_SIZE; ++i) {
> - next_map.nodes[ret][i].is_leaf = 0;
> - next_map.nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
> + map->nodes[ret][i].is_leaf = 0;
> + map->nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
> }
> return ret;
> }
>
> -static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
> - hwaddr *nb, uint16_t leaf,
> +static void phys_page_set_level(PhysPageMap *map, PhysPageEntry *lp,
> + hwaddr *index, hwaddr *nb, uint16_t leaf,
> int level)
> {
> PhysPageEntry *p;
> @@ -171,8 +164,8 @@ static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
> hwaddr step = (hwaddr)1 << (level * L2_BITS);
>
> if (!lp->is_leaf && lp->ptr == PHYS_MAP_NODE_NIL) {
> - lp->ptr = phys_map_node_alloc();
> - p = next_map.nodes[lp->ptr];
> + lp->ptr = phys_map_node_alloc(map);
> + p = map->nodes[lp->ptr];
> if (level == 0) {
> for (i = 0; i < L2_SIZE; i++) {
> p[i].is_leaf = 1;
> @@ -180,7 +173,7 @@ static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
> }
> }
> } else {
> - p = next_map.nodes[lp->ptr];
> + p = map->nodes[lp->ptr];
> }
> lp = &p[(*index >> (level * L2_BITS)) & (L2_SIZE - 1)];
>
> @@ -191,7 +184,7 @@ static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
> *index += step;
> *nb -= step;
> } else {
> - phys_page_set_level(lp, index, nb, leaf, level - 1);
> + phys_page_set_level(map, lp, index, nb, leaf, level - 1);
> }
> ++lp;
> }
> @@ -202,9 +195,9 @@ static void phys_page_set(AddressSpaceDispatch *d,
> uint16_t leaf)
> {
> /* Wildly overreserve - it doesn't matter much. */
> - phys_map_node_reserve(3 * P_L2_LEVELS);
> + phys_map_node_reserve(&d->map, 3 * P_L2_LEVELS);
>
> - phys_page_set_level(&d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
> + phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
> }
>
> static MemoryRegionSection *phys_page_find(PhysPageEntry lp, hwaddr index,
> @@ -237,10 +230,10 @@ static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d,
> subpage_t *subpage;
>
> section = phys_page_find(d->phys_map, addr >> TARGET_PAGE_BITS,
> - d->nodes, d->sections);
> + d->map.nodes, d->map.sections);
> if (resolve_subpage && section->mr->subpage) {
> subpage = container_of(section->mr, subpage_t, iomem);
> - section = &d->sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
> + section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
> }
> return section;
> }
> @@ -708,7 +701,7 @@ hwaddr memory_region_section_get_iotlb(CPUArchState *env,
> iotlb |= PHYS_SECTION_ROM;
> }
> } else {
> - iotlb = section - address_space_memory.dispatch->sections;
> + iotlb = section - address_space_memory.dispatch->map.sections;
> iotlb += xlat;
> }
>
> @@ -747,23 +740,23 @@ void phys_mem_set_alloc(void *(*alloc)(size_t))
> phys_mem_alloc = alloc;
> }
>
> -static uint16_t phys_section_add(MemoryRegionSection *section)
> +static uint16_t phys_section_add(PhysPageMap *map,
> + MemoryRegionSection *section)
> {
> /* The physical section number is ORed with a page-aligned
> * pointer to produce the iotlb entries. Thus it should
> * never overflow into the page-aligned value.
> */
> - assert(next_map.sections_nb < TARGET_PAGE_SIZE);
> + assert(map->sections_nb < TARGET_PAGE_SIZE);
>
> - if (next_map.sections_nb == next_map.sections_nb_alloc) {
> - next_map.sections_nb_alloc = MAX(next_map.sections_nb_alloc * 2,
> - 16);
> - next_map.sections = g_renew(MemoryRegionSection, next_map.sections,
> - next_map.sections_nb_alloc);
> + if (map->sections_nb == map->sections_nb_alloc) {
> + map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16);
> + map->sections = g_renew(MemoryRegionSection, map->sections,
> + map->sections_nb_alloc);
> }
> - next_map.sections[next_map.sections_nb] = *section;
> + map->sections[map->sections_nb] = *section;
> memory_region_ref(section->mr);
> - return next_map.sections_nb++;
> + return map->sections_nb++;
> }
>
> static void phys_section_destroy(MemoryRegion *mr)
> @@ -785,7 +778,6 @@ static void phys_sections_free(PhysPageMap *map)
> }
> g_free(map->sections);
> g_free(map->nodes);
> - g_free(map);
> }
>
> static void register_subpage(AddressSpaceDispatch *d, MemoryRegionSection *section)
> @@ -794,7 +786,7 @@ static void register_subpage(AddressSpaceDispatch *d, MemoryRegionSection *secti
> hwaddr base = section->offset_within_address_space
> & TARGET_PAGE_MASK;
> MemoryRegionSection *existing = phys_page_find(d->phys_map, base >> TARGET_PAGE_BITS,
> - next_map.nodes, next_map.sections);
> + d->map.nodes, d->map.sections);
> MemoryRegionSection subsection = {
> .offset_within_address_space = base,
> .size = int128_make64(TARGET_PAGE_SIZE),
> @@ -807,13 +799,14 @@ static void register_subpage(AddressSpaceDispatch *d, MemoryRegionSection *secti
> subpage = subpage_init(d->as, base);
> subsection.mr = &subpage->iomem;
> phys_page_set(d, base >> TARGET_PAGE_BITS, 1,
> - phys_section_add(&subsection));
> + phys_section_add(&d->map, &subsection));
> } else {
> subpage = container_of(existing->mr, subpage_t, iomem);
> }
> start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
> end = start + int128_get64(section->size) - 1;
> - subpage_register(subpage, start, end, phys_section_add(section));
> + subpage_register(subpage, start, end,
> + phys_section_add(&d->map, section));
> }
>
>
> @@ -821,7 +814,7 @@ static void register_multipage(AddressSpaceDispatch *d,
> MemoryRegionSection *section)
> {
> hwaddr start_addr = section->offset_within_address_space;
> - uint16_t section_index = phys_section_add(section);
> + uint16_t section_index = phys_section_add(&d->map, section);
> uint64_t num_pages = int128_get64(int128_rshift(section->size,
> TARGET_PAGE_BITS));
>
> @@ -1605,7 +1598,7 @@ static subpage_t *subpage_init(AddressSpace *as, hwaddr base)
> return mmio;
> }
>
> -static uint16_t dummy_section(MemoryRegion *mr)
> +static uint16_t dummy_section(PhysPageMap *map, MemoryRegion *mr)
> {
> MemoryRegionSection section = {
> .mr = mr,
> @@ -1614,12 +1607,13 @@ static uint16_t dummy_section(MemoryRegion *mr)
> .size = int128_2_64(),
> };
>
> - return phys_section_add(&section);
> + return phys_section_add(map, &section);
> }
>
> MemoryRegion *iotlb_to_region(hwaddr index)
> {
> - return address_space_memory.dispatch->sections[index & ~TARGET_PAGE_MASK].mr;
> + return address_space_memory.dispatch->map.sections[
> + index & ~TARGET_PAGE_MASK].mr;
> }
>
> static void io_mem_init(void)
> @@ -1636,7 +1630,17 @@ static void io_mem_init(void)
> static void mem_begin(MemoryListener *listener)
> {
> AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
> - AddressSpaceDispatch *d = g_new(AddressSpaceDispatch, 1);
> + AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1);
> + uint16_t n;
> +
> + n = dummy_section(&d->map, &io_mem_unassigned);
> + assert(n == PHYS_SECTION_UNASSIGNED);
> + n = dummy_section(&d->map, &io_mem_notdirty);
> + assert(n == PHYS_SECTION_NOTDIRTY);
> + n = dummy_section(&d->map, &io_mem_rom);
> + assert(n == PHYS_SECTION_ROM);
> + n = dummy_section(&d->map, &io_mem_watch);
> + assert(n == PHYS_SECTION_WATCH);
>
> d->phys_map = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .is_leaf = 0 };
> d->as = as;
> @@ -1649,37 +1653,12 @@ static void mem_commit(MemoryListener *listener)
> AddressSpaceDispatch *cur = as->dispatch;
> AddressSpaceDispatch *next = as->next_dispatch;
>
> - next->nodes = next_map.nodes;
> - next->sections = next_map.sections;
> -
> as->dispatch = next;
> - g_free(cur);
> -}
> -
> -static void core_begin(MemoryListener *listener)
> -{
> - uint16_t n;
>
> - prev_map = g_new(PhysPageMap, 1);
> - *prev_map = next_map;
> -
> - memset(&next_map, 0, sizeof(next_map));
> - n = dummy_section(&io_mem_unassigned);
> - assert(n == PHYS_SECTION_UNASSIGNED);
> - n = dummy_section(&io_mem_notdirty);
> - assert(n == PHYS_SECTION_NOTDIRTY);
> - n = dummy_section(&io_mem_rom);
> - assert(n == PHYS_SECTION_ROM);
> - n = dummy_section(&io_mem_watch);
> - assert(n == PHYS_SECTION_WATCH);
> -}
> -
> -/* This listener's commit run after the other AddressSpaceDispatch listeners'.
> - * All AddressSpaceDispatch instances have switched to the next map.
> - */
> -static void core_commit(MemoryListener *listener)
> -{
> - phys_sections_free(prev_map);
> + if (cur) {
> + phys_sections_free(&cur->map);
> + g_free(cur);
> + }
> }
>
> static void tcg_commit(MemoryListener *listener)
> @@ -1707,8 +1686,6 @@ static void core_log_global_stop(MemoryListener *listener)
> }
>
> static MemoryListener core_memory_listener = {
> - .begin = core_begin,
> - .commit = core_commit,
> .log_global_start = core_log_global_start,
> .log_global_stop = core_log_global_stop,
> .priority = 1,
> --
> 1.8.3.1
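For anyone skimming the hunks above, the net effect is that the
begin/commit pair now manages a per-dispatch map instead of the global
prev_map/next_map that core_begin/core_commit used to juggle. A
simplified paraphrase of the resulting lifecycle (not the literal
exec.c code, just a sketch of what the patch leaves behind):

    /* Sketch: each AddressSpaceDispatch owns its PhysPageMap, so begin
     * builds a fresh map and commit frees the map of the old dispatch. */
    static void begin_sketch(AddressSpace *as)
    {
        AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1);

        /* the fixed sections are re-added per map, in the same order as
         * the PHYS_SECTION_* constants, hence the asserts in mem_begin */
        dummy_section(&d->map, &io_mem_unassigned); /* PHYS_SECTION_UNASSIGNED */
        dummy_section(&d->map, &io_mem_notdirty);   /* PHYS_SECTION_NOTDIRTY   */
        dummy_section(&d->map, &io_mem_rom);        /* PHYS_SECTION_ROM        */
        dummy_section(&d->map, &io_mem_watch);      /* PHYS_SECTION_WATCH      */

        d->phys_map = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .is_leaf = 0 };
        d->as = as;
        as->next_dispatch = d;
    }

    static void commit_sketch(AddressSpace *as)
    {
        AddressSpaceDispatch *cur  = as->dispatch;
        AddressSpaceDispatch *next = as->next_dispatch;

        as->dispatch = next;
        if (cur) {
            phys_sections_free(&cur->map); /* nodes/sections now freed per AS */
            g_free(cur);
        }
    }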
Thread overview: 9+ messages
2013-12-01 12:02 [Qemu-devel] [PATCH] exec: separate sections and nodes per address space Marcel Apfelbaum
2013-12-02 13:31 ` Michael S. Tsirkin [this message]
2013-12-09 18:02 ` Paolo Bonzini
2013-12-10 11:49 ` Michael S. Tsirkin
2013-12-10 12:37 ` Marcel Apfelbaum
2013-12-10 12:38 ` Paolo Bonzini
2013-12-10 12:50 ` Marcel Apfelbaum
2013-12-10 16:48 ` Michael S. Tsirkin
2013-12-11 9:19 ` Marcel Apfelbaum