qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: "Michael S. Tsirkin" <mst@redhat.com>
To: qemu-devel@nongnu.org
Cc: Paolo Bonzini <pbonzini@redhat.com>,
	Marcel Apfelbaum <marcel.a@redhat.com>
Subject: [Qemu-devel] [PULL 24/28] exec: separate sections and nodes per address space
Date: Wed, 11 Dec 2013 20:31:28 +0200	[thread overview]
Message-ID: <1386786509-29966-24-git-send-email-mst@redhat.com> (raw)
In-Reply-To: <cover.1386786228.git.mst@redhat.com>

From: Marcel Apfelbaum <marcel.a@redhat.com>

Every address space has its own nodes and sections, but
it uses the same global arrays of nodes/section.

This limits the number of devices that can be attached
to the guest to 20-30 devices. It happens because:
 - The sections array is limited to 2^12 entries.
 - The main memory has at least 100 sections.
 - Each device address space is actually an alias to
   main memory, multiplying its number of nodes/sections.

Remove the limitation by using separate arrays of
nodes and sections for each address space.

Signed-off-by: Marcel Apfelbaum <marcel.a@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 exec.c | 155 ++++++++++++++++++++++++++++-------------------------------------
 1 file changed, 66 insertions(+), 89 deletions(-)

diff --git a/exec.c b/exec.c
index 67a073c..00526d1 100644
--- a/exec.c
+++ b/exec.c
@@ -103,13 +103,21 @@ struct PhysPageEntry {
 
 typedef PhysPageEntry Node[P_L2_SIZE];
 
+typedef struct PhysPageMap {
+    unsigned sections_nb;
+    unsigned sections_nb_alloc;
+    unsigned nodes_nb;
+    unsigned nodes_nb_alloc;
+    Node *nodes;
+    MemoryRegionSection *sections;
+} PhysPageMap;
+
 struct AddressSpaceDispatch {
     /* This is a multi-level map on the physical address space.
      * The bottom level has pointers to MemoryRegionSections.
      */
     PhysPageEntry phys_map;
-    Node *nodes;
-    MemoryRegionSection *sections;
+    PhysPageMap map;
     AddressSpace *as;
 };
 
@@ -126,18 +134,6 @@ typedef struct subpage_t {
 #define PHYS_SECTION_ROM 2
 #define PHYS_SECTION_WATCH 3
 
-typedef struct PhysPageMap {
-    unsigned sections_nb;
-    unsigned sections_nb_alloc;
-    unsigned nodes_nb;
-    unsigned nodes_nb_alloc;
-    Node *nodes;
-    MemoryRegionSection *sections;
-} PhysPageMap;
-
-static PhysPageMap *prev_map;
-static PhysPageMap next_map;
-
 static void io_mem_init(void);
 static void memory_map_init(void);
 
@@ -146,35 +142,32 @@ static MemoryRegion io_mem_watch;
 
 #if !defined(CONFIG_USER_ONLY)
 
-static void phys_map_node_reserve(unsigned nodes)
+static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
 {
-    if (next_map.nodes_nb + nodes > next_map.nodes_nb_alloc) {
-        next_map.nodes_nb_alloc = MAX(next_map.nodes_nb_alloc * 2,
-                                            16);
-        next_map.nodes_nb_alloc = MAX(next_map.nodes_nb_alloc,
-                                      next_map.nodes_nb + nodes);
-        next_map.nodes = g_renew(Node, next_map.nodes,
-                                 next_map.nodes_nb_alloc);
+    if (map->nodes_nb + nodes > map->nodes_nb_alloc) {
+        map->nodes_nb_alloc = MAX(map->nodes_nb_alloc * 2, 16);
+        map->nodes_nb_alloc = MAX(map->nodes_nb_alloc, map->nodes_nb + nodes);
+        map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc);
     }
 }
 
-static uint32_t phys_map_node_alloc(void)
+static uint32_t phys_map_node_alloc(PhysPageMap *map)
 {
     unsigned i;
     uint32_t ret;
 
-    ret = next_map.nodes_nb++;
+    ret = map->nodes_nb++;
     assert(ret != PHYS_MAP_NODE_NIL);
-    assert(ret != next_map.nodes_nb_alloc);
+    assert(ret != map->nodes_nb_alloc);
     for (i = 0; i < P_L2_SIZE; ++i) {
-        next_map.nodes[ret][i].skip = 1;
-        next_map.nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
+        map->nodes[ret][i].skip = 1;
+        map->nodes[ret][i].ptr = PHYS_MAP_NODE_NIL;
     }
     return ret;
 }
 
-static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
-                                hwaddr *nb, uint16_t leaf,
+static void phys_page_set_level(PhysPageMap *map, PhysPageEntry *lp,
+                                hwaddr *index, hwaddr *nb, uint16_t leaf,
                                 int level)
 {
     PhysPageEntry *p;
@@ -182,8 +175,8 @@ static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
     hwaddr step = (hwaddr)1 << (level * P_L2_BITS);
 
     if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) {
-        lp->ptr = phys_map_node_alloc();
-        p = next_map.nodes[lp->ptr];
+        lp->ptr = phys_map_node_alloc(map);
+        p = map->nodes[lp->ptr];
         if (level == 0) {
             for (i = 0; i < P_L2_SIZE; i++) {
                 p[i].skip = 0;
@@ -191,7 +184,7 @@ static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
             }
         }
     } else {
-        p = next_map.nodes[lp->ptr];
+        p = map->nodes[lp->ptr];
     }
     lp = &p[(*index >> (level * P_L2_BITS)) & (P_L2_SIZE - 1)];
 
@@ -202,7 +195,7 @@ static void phys_page_set_level(PhysPageEntry *lp, hwaddr *index,
             *index += step;
             *nb -= step;
         } else {
-            phys_page_set_level(lp, index, nb, leaf, level - 1);
+            phys_page_set_level(map, lp, index, nb, leaf, level - 1);
         }
         ++lp;
     }
@@ -213,9 +206,9 @@ static void phys_page_set(AddressSpaceDispatch *d,
                           uint16_t leaf)
 {
     /* Wildly overreserve - it doesn't matter much. */
-    phys_map_node_reserve(3 * P_L2_LEVELS);
+    phys_map_node_reserve(&d->map, 3 * P_L2_LEVELS);
 
-    phys_page_set_level(&d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
+    phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
 }
 
 /* Compact a non leaf page entry. Simply detect that the entry has a single child,
@@ -276,7 +269,7 @@ static void phys_page_compact_all(AddressSpaceDispatch *d, int nodes_nb)
     DECLARE_BITMAP(compacted, nodes_nb);
 
     if (d->phys_map.skip) {
-        phys_page_compact(&d->phys_map, d->nodes, compacted);
+        phys_page_compact(&d->phys_map, d->map.nodes, compacted);
     }
 }
 
@@ -317,10 +310,10 @@ static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d,
     MemoryRegionSection *section;
     subpage_t *subpage;
 
-    section = phys_page_find(d->phys_map, addr, d->nodes, d->sections);
+    section = phys_page_find(d->phys_map, addr, d->map.nodes, d->map.sections);
     if (resolve_subpage && section->mr->subpage) {
         subpage = container_of(section->mr, subpage_t, iomem);
-        section = &d->sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
+        section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
     }
     return section;
 }
@@ -788,7 +781,7 @@ hwaddr memory_region_section_get_iotlb(CPUArchState *env,
             iotlb |= PHYS_SECTION_ROM;
         }
     } else {
-        iotlb = section - address_space_memory.dispatch->sections;
+        iotlb = section - address_space_memory.dispatch->map.sections;
         iotlb += xlat;
     }
 
@@ -827,23 +820,23 @@ void phys_mem_set_alloc(void *(*alloc)(size_t))
     phys_mem_alloc = alloc;
 }
 
-static uint16_t phys_section_add(MemoryRegionSection *section)
+static uint16_t phys_section_add(PhysPageMap *map,
+                                 MemoryRegionSection *section)
 {
     /* The physical section number is ORed with a page-aligned
      * pointer to produce the iotlb entries.  Thus it should
      * never overflow into the page-aligned value.
      */
-    assert(next_map.sections_nb < TARGET_PAGE_SIZE);
+    assert(map->sections_nb < TARGET_PAGE_SIZE);
 
-    if (next_map.sections_nb == next_map.sections_nb_alloc) {
-        next_map.sections_nb_alloc = MAX(next_map.sections_nb_alloc * 2,
-                                         16);
-        next_map.sections = g_renew(MemoryRegionSection, next_map.sections,
-                                    next_map.sections_nb_alloc);
+    if (map->sections_nb == map->sections_nb_alloc) {
+        map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16);
+        map->sections = g_renew(MemoryRegionSection, map->sections,
+                                map->sections_nb_alloc);
     }
-    next_map.sections[next_map.sections_nb] = *section;
+    map->sections[map->sections_nb] = *section;
     memory_region_ref(section->mr);
-    return next_map.sections_nb++;
+    return map->sections_nb++;
 }
 
 static void phys_section_destroy(MemoryRegion *mr)
@@ -865,7 +858,6 @@ static void phys_sections_free(PhysPageMap *map)
     }
     g_free(map->sections);
     g_free(map->nodes);
-    g_free(map);
 }
 
 static void register_subpage(AddressSpaceDispatch *d, MemoryRegionSection *section)
@@ -874,7 +866,7 @@ static void register_subpage(AddressSpaceDispatch *d, MemoryRegionSection *secti
     hwaddr base = section->offset_within_address_space
         & TARGET_PAGE_MASK;
     MemoryRegionSection *existing = phys_page_find(d->phys_map, base,
-                                                   next_map.nodes, next_map.sections);
+                                                   d->map.nodes, d->map.sections);
     MemoryRegionSection subsection = {
         .offset_within_address_space = base,
         .size = int128_make64(TARGET_PAGE_SIZE),
@@ -887,13 +879,14 @@ static void register_subpage(AddressSpaceDispatch *d, MemoryRegionSection *secti
         subpage = subpage_init(d->as, base);
         subsection.mr = &subpage->iomem;
         phys_page_set(d, base >> TARGET_PAGE_BITS, 1,
-                      phys_section_add(&subsection));
+                      phys_section_add(&d->map, &subsection));
     } else {
         subpage = container_of(existing->mr, subpage_t, iomem);
     }
     start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
     end = start + int128_get64(section->size) - 1;
-    subpage_register(subpage, start, end, phys_section_add(section));
+    subpage_register(subpage, start, end,
+                     phys_section_add(&d->map, section));
 }
 
 
@@ -901,7 +894,7 @@ static void register_multipage(AddressSpaceDispatch *d,
                                MemoryRegionSection *section)
 {
     hwaddr start_addr = section->offset_within_address_space;
-    uint16_t section_index = phys_section_add(section);
+    uint16_t section_index = phys_section_add(&d->map, section);
     uint64_t num_pages = int128_get64(int128_rshift(section->size,
                                                     TARGET_PAGE_BITS));
 
@@ -1720,7 +1713,7 @@ static subpage_t *subpage_init(AddressSpace *as, hwaddr base)
     return mmio;
 }
 
-static uint16_t dummy_section(MemoryRegion *mr)
+static uint16_t dummy_section(PhysPageMap *map, MemoryRegion *mr)
 {
     MemoryRegionSection section = {
         .mr = mr,
@@ -1729,12 +1722,13 @@ static uint16_t dummy_section(MemoryRegion *mr)
         .size = int128_2_64(),
     };
 
-    return phys_section_add(&section);
+    return phys_section_add(map, &section);
 }
 
 MemoryRegion *iotlb_to_region(hwaddr index)
 {
-    return address_space_memory.dispatch->sections[index & ~TARGET_PAGE_MASK].mr;
+    return address_space_memory.dispatch->map.sections[
+           index & ~TARGET_PAGE_MASK].mr;
 }
 
 static void io_mem_init(void)
@@ -1751,7 +1745,17 @@ static void io_mem_init(void)
 static void mem_begin(MemoryListener *listener)
 {
     AddressSpace *as = container_of(listener, AddressSpace, dispatch_listener);
-    AddressSpaceDispatch *d = g_new(AddressSpaceDispatch, 1);
+    AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1);
+    uint16_t n;
+
+    n = dummy_section(&d->map, &io_mem_unassigned);
+    assert(n == PHYS_SECTION_UNASSIGNED);
+    n = dummy_section(&d->map, &io_mem_notdirty);
+    assert(n == PHYS_SECTION_NOTDIRTY);
+    n = dummy_section(&d->map, &io_mem_rom);
+    assert(n == PHYS_SECTION_ROM);
+    n = dummy_section(&d->map, &io_mem_watch);
+    assert(n == PHYS_SECTION_WATCH);
 
     d->phys_map  = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 };
     d->as = as;
@@ -1764,39 +1768,14 @@ static void mem_commit(MemoryListener *listener)
     AddressSpaceDispatch *cur = as->dispatch;
     AddressSpaceDispatch *next = as->next_dispatch;
 
-    next->nodes = next_map.nodes;
-    next->sections = next_map.sections;
-
-    phys_page_compact_all(next, next_map.nodes_nb);
+    phys_page_compact_all(next, next->map.nodes_nb);
 
     as->dispatch = next;
-    g_free(cur);
-}
-
-static void core_begin(MemoryListener *listener)
-{
-    uint16_t n;
 
-    prev_map = g_new(PhysPageMap, 1);
-    *prev_map = next_map;
-
-    memset(&next_map, 0, sizeof(next_map));
-    n = dummy_section(&io_mem_unassigned);
-    assert(n == PHYS_SECTION_UNASSIGNED);
-    n = dummy_section(&io_mem_notdirty);
-    assert(n == PHYS_SECTION_NOTDIRTY);
-    n = dummy_section(&io_mem_rom);
-    assert(n == PHYS_SECTION_ROM);
-    n = dummy_section(&io_mem_watch);
-    assert(n == PHYS_SECTION_WATCH);
-}
-
-/* This listener's commit run after the other AddressSpaceDispatch listeners'.
- * All AddressSpaceDispatch instances have switched to the next map.
- */
-static void core_commit(MemoryListener *listener)
-{
-    phys_sections_free(prev_map);
+    if (cur) {
+        phys_sections_free(&cur->map);
+        g_free(cur);
+    }
 }
 
 static void tcg_commit(MemoryListener *listener)
@@ -1824,8 +1803,6 @@ static void core_log_global_stop(MemoryListener *listener)
 }
 
 static MemoryListener core_memory_listener = {
-    .begin = core_begin,
-    .commit = core_commit,
     .log_global_start = core_log_global_start,
     .log_global_stop = core_log_global_stop,
     .priority = 1,
-- 
MST

  parent reply	other threads:[~2013-12-11 18:28 UTC|newest]

Thread overview: 74+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-12-11 18:30 [Qemu-devel] [PULL 00/28] acpi.pci,pc,memory core fixes Michael S. Tsirkin
2013-12-11 18:30 ` [Qemu-devel] [PULL 01/28] hw: Pass QEMUMachine to its init() method Michael S. Tsirkin
2013-12-11 18:30 ` [Qemu-devel] [PULL 02/28] pc: map PCI address space as catchall region for not mapped addresses Michael S. Tsirkin
2013-12-11 18:30 ` [Qemu-devel] [PULL 03/28] qtest: split configuration of qtest accelerator and chardev Michael S. Tsirkin
2013-12-11 18:30 ` [Qemu-devel] [PULL 04/28] acpi-test: basic acpi unit-test Michael S. Tsirkin
2013-12-11 18:30 ` [Qemu-devel] [PULL 05/28] MAINTAINERS: update X86 machine entry Michael S. Tsirkin
2013-12-11 18:30 ` [Qemu-devel] [PULL 06/28] pci: fix address space size for bridge Michael S. Tsirkin
2013-12-11 18:30 ` [Qemu-devel] [PULL 07/28] pc: s/INT64_MAX/UINT64_MAX/ Michael S. Tsirkin
2013-12-11 18:30 ` [Qemu-devel] [PULL 08/28] spapr_pci: s/INT64_MAX/UINT64_MAX/ Michael S. Tsirkin
2013-12-11 18:30 ` [Qemu-devel] [PULL 09/28] split definitions for exec.c and translate-all.c radix trees Michael S. Tsirkin
2013-12-11 18:30 ` [Qemu-devel] [PULL 10/28] exec: replace leaf with skip Michael S. Tsirkin
2013-12-11 18:30 ` [Qemu-devel] [PULL 11/28] exec: extend skip field to 6 bit, page entry to 32 bit Michael S. Tsirkin
2013-12-11 18:30 ` [Qemu-devel] [PULL 12/28] exec: pass hw address to phys_page_find Michael S. Tsirkin
2013-12-11 18:30 ` [Qemu-devel] [PULL 13/28] exec: memory radix tree page level compression Michael S. Tsirkin
2013-12-11 18:30 ` [Qemu-devel] [PULL 14/28] exec: make address spaces 64-bit wide Michael S. Tsirkin
2014-01-09 17:24   ` Alex Williamson
2014-01-09 18:00     ` Michael S. Tsirkin
2014-01-09 18:47       ` Alex Williamson
2014-01-09 19:03         ` Alex Williamson
2014-01-09 21:56           ` Michael S. Tsirkin
2014-01-09 22:42             ` Alex Williamson
2014-01-10 12:55               ` Michael S. Tsirkin
2014-01-10 15:31                 ` Alex Williamson
2014-01-12  7:54                   ` Michael S. Tsirkin
2014-01-12 15:03                     ` Alexander Graf
2014-01-13 21:39                       ` Alex Williamson
2014-01-13 21:48                         ` Alexander Graf
2014-01-13 22:48                           ` Alex Williamson
2014-01-14 10:24                             ` Avi Kivity
2014-01-14 11:50                               ` Michael S. Tsirkin
2014-01-14 15:36                               ` Alex Williamson
2014-01-14 16:20                                 ` Michael S. Tsirkin
2014-01-14 12:07                             ` Michael S. Tsirkin
2014-01-14 15:57                               ` Alex Williamson
2014-01-14 16:03                                 ` Michael S. Tsirkin
2014-01-14 16:15                                   ` Alex Williamson
2014-01-14 16:18                                     ` Michael S. Tsirkin
2014-01-14 16:39                                       ` Alex Williamson
2014-01-14 16:45                                         ` Michael S. Tsirkin
2014-01-14  8:18                           ` Michael S. Tsirkin
2014-01-14  9:20                             ` Alexander Graf
2014-01-14  9:31                               ` Peter Maydell
2014-01-14 10:28                               ` Michael S. Tsirkin
2014-01-14 10:43                               ` Michael S. Tsirkin
2014-01-14 12:21                         ` Michael S. Tsirkin
2014-01-14 15:49                           ` Alex Williamson
2014-01-14 16:07                             ` Michael S. Tsirkin
2014-01-14 17:49                             ` Mike Day
2014-01-14 17:55                               ` Mike Day
2014-01-14 18:05                                 ` Alex Williamson
2014-01-14 18:20                                   ` Mike Day
2014-01-14 13:50                     ` Mike Day
2014-01-14 14:05                       ` Michael S. Tsirkin
2014-01-14 15:01                         ` Mike Day
2014-01-15  0:48                         ` Alexey Kardashevskiy
2014-01-20 16:20     ` Mike Day
2014-01-20 16:45       ` Alex Williamson
2014-01-20 17:04         ` Michael S. Tsirkin
2014-01-20 17:16           ` Alex Williamson
2014-01-20 20:37             ` Michael S. Tsirkin
2013-12-11 18:30 ` [Qemu-devel] [PULL 15/28] exec: reduce L2_PAGE_SIZE Michael S. Tsirkin
2013-12-11 18:30 ` [Qemu-devel] [PULL 16/28] smbios: Set system manufacturer, product & version by default Michael S. Tsirkin
2013-12-11 18:31 ` [Qemu-devel] [PULL 17/28] acpi unit-test: verify signature and checksum Michael S. Tsirkin
2013-12-11 18:31 ` [Qemu-devel] [PULL 18/28] acpi: strip compiler info in built-in DSDT Michael S. Tsirkin
2013-12-11 18:31 ` [Qemu-devel] [PULL 19/28] ACPI DSDT: Make control method `IQCR` serialized Michael S. Tsirkin
2013-12-11 18:31 ` [Qemu-devel] [PULL 20/28] pci: fix pci bridge fw path Michael S. Tsirkin
2013-12-11 18:31 ` [Qemu-devel] [PULL 21/28] hpet: inverse polarity when pin above ISA_NUM_IRQS Michael S. Tsirkin
2013-12-11 18:31 ` [Qemu-devel] [PULL 22/28] hpet: enable to entitle more irq pins for hpet Michael S. Tsirkin
2013-12-11 18:31 ` [Qemu-devel] [PULL 23/28] memory.c: bugfix - ref counting mismatch in memory_region_find Michael S. Tsirkin
2013-12-11 18:31 ` Michael S. Tsirkin [this message]
2013-12-11 18:31 ` [Qemu-devel] [PULL 25/28] acpi unit-test: load and check facs table Michael S. Tsirkin
2013-12-11 18:31 ` [Qemu-devel] [PULL 26/28] acpi unit-test: adjust the test data structure for better handling Michael S. Tsirkin
2013-12-11 18:31 ` [Qemu-devel] [PULL 27/28] hpet: fix build with CONFIG_HPET off Michael S. Tsirkin
2013-12-11 18:31 ` [Qemu-devel] [PULL 28/28] pc: use macro for HPET type Michael S. Tsirkin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1386786509-29966-24-git-send-email-mst@redhat.com \
    --to=mst@redhat.com \
    --cc=marcel.a@redhat.com \
    --cc=pbonzini@redhat.com \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).