[RFC PATCH] cputlb and ssi: cache class to avoid expensive object_dynamic_cast_assert (HACK!!!)

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

From: "Alex Bennée" <alex.bennee@linaro.org>
To: qemu-devel@nongnu.org
Cc: "Alex Bennée" <alex.bennee@linaro.org>,
	"Cédric Le Goater" <clg@kaod.org>,
	"Richard Henderson" <richard.henderson@linaro.org>,
	"Paolo Bonzini" <pbonzini@redhat.com>,
	"Alistair Francis" <alistair@alistair23.me>,
	"Eduardo Habkost" <eduardo@habkost.net>,
	"Marcel Apfelbaum" <marcel.apfelbaum@gmail.com>,
	"Philippe Mathieu-Daudé" <f4bug@amsat.org>,
	"Yanan Wang" <wangyanan55@huawei.com>,
	"Daniel P. Berrangé" <berrange@redhat.com>
Subject: [RFC PATCH] cputlb and ssi: cache class to avoid expensive object_dynamic_cast_assert (HACK!!!)
Date: Thu,  4 Aug 2022 10:20:44 +0100	[thread overview]
Message-ID: <20220804092044.2101093-1-alex.bennee@linaro.org> (raw)

Investigating why some BMC models are so slow compared to a plain ARM
virt machines I did some profiling of:

  ./qemu-system-arm -M romulus-bmc -nic user \
    -drive
    file=obmc-phosphor-image-romulus.static.mtd,format=raw,if=mtd \
    -nographic -serial mon:stdio

And saw that object_dynamic_cast was dominating the profile times. We
have a number of cases in the CPU hot path and more importantly for
this model in the SSI bus. As the class is static once the object is
created we just cache it and use it instead of the dynamic case
macros.

[AJB: I suspect a proper fix for this is for QOM to support a cached
class lookup, abortive macro attempt #if 0'd in this patch].

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Cc: Cédric Le Goater <clg@kaod.org>
---
 include/hw/core/cpu.h |  2 ++
 include/hw/ssi/ssi.h  |  3 +++
 include/qom/object.h  | 29 +++++++++++++++++++++++++++++
 accel/tcg/cputlb.c    | 12 ++++++++----
 hw/ssi/ssi.c          | 10 +++++++---
 5 files changed, 49 insertions(+), 7 deletions(-)

diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index 500503da13..70027a772e 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -317,6 +317,8 @@ struct qemu_work_item;
 struct CPUState {
     /*< private >*/
     DeviceState parent_obj;
+    /* cache to avoid expensive CPU_GET_CLASS */
+    CPUClass *cc;
     /*< public >*/
 
     int nr_cores;
diff --git a/include/hw/ssi/ssi.h b/include/hw/ssi/ssi.h
index f411858ab0..6950f86810 100644
--- a/include/hw/ssi/ssi.h
+++ b/include/hw/ssi/ssi.h
@@ -59,6 +59,9 @@ struct SSIPeripheralClass {
 struct SSIPeripheral {
     DeviceState parent_obj;
 
+    /* cache the class */
+    SSIPeripheralClass *spc;
+
     /* Chip select state */
     bool cs;
 };
diff --git a/include/qom/object.h b/include/qom/object.h
index ef7258a5e1..2202dbfa43 100644
--- a/include/qom/object.h
+++ b/include/qom/object.h
@@ -198,6 +198,35 @@ struct Object
     OBJ_NAME##_CLASS(const void *klass) \
     { return OBJECT_CLASS_CHECK(ClassType, klass, TYPENAME); }
 
+#if 0
+/**
+ * DECLARE_CACHED_CLASS_CHECKER:
+ * @InstanceType: instance struct name
+ * @ClassType: class struct name
+ * @OBJ_NAME: the object name in uppercase with underscore separators
+ * @TYPENAME: type name
+ *
+ * This variant of DECLARE_CLASS_CHECKERS allows for the caching of
+ * class in the parent object instance. This is useful for very hot
+ * path code at the expense of an extra indirection and check. As per
+ * the original direct usage of this macro should be avoided if the
+ * complete OBJECT_DECLARE_TYPE macro has been used.
+ *
+ * This macro will provide the class type cast functions for a
+ * QOM type.
+ */
+#define DECLARE_CACHED_CLASS_CHECKERS(InstanceType, ClassType, OBJ_NAME, TYPENAME) \
+    DECLARE_CLASS_CHECKERS(ClassType, OBJ_NAME, TYPENAME) \
+    static inline G_GNUC_UNUSED ClassType * \
+    OBJ_NAME##_GET_CACHED_CLASS(const void *obj) \
+    { \
+        InstanceType *p = (InstanceType *) obj; \
+        p->cc = p->cc ? p->cc : OBJECT_GET_CLASS(ClassType, obj, TYPENAME);\
+        return p->cc;                                                   \
+    }
+
+#endif
+
 /**
  * DECLARE_OBJ_CHECKERS:
  * @InstanceType: instance struct name
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index a46f3a654d..882315f7dd 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -1303,8 +1303,9 @@ static inline ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr)
 static void tlb_fill(CPUState *cpu, target_ulong addr, int size,
                      MMUAccessType access_type, int mmu_idx, uintptr_t retaddr)
 {
-    CPUClass *cc = CPU_GET_CLASS(cpu);
+    CPUClass *cc = cpu->cc ? cpu->cc : CPU_GET_CLASS(cpu);
     bool ok;
+    cpu->cc = cc;
 
     /*
      * This is not a probe, so only valid return is success; failure
@@ -1319,7 +1320,8 @@ static inline void cpu_unaligned_access(CPUState *cpu, vaddr addr,
                                         MMUAccessType access_type,
                                         int mmu_idx, uintptr_t retaddr)
 {
-    CPUClass *cc = CPU_GET_CLASS(cpu);
+    CPUClass *cc = cpu->cc ? cpu->cc : CPU_GET_CLASS(cpu);
+    cpu->cc = cc;
 
     cc->tcg_ops->do_unaligned_access(cpu, addr, access_type, mmu_idx, retaddr);
 }
@@ -1331,7 +1333,8 @@ static inline void cpu_transaction_failed(CPUState *cpu, hwaddr physaddr,
                                           MemTxResult response,
                                           uintptr_t retaddr)
 {
-    CPUClass *cc = CPU_GET_CLASS(cpu);
+    CPUClass *cc = cpu->cc ? cpu->cc : CPU_GET_CLASS(cpu);
+    cpu->cc = cc;
 
     if (!cpu->ignore_memory_transaction_failures &&
         cc->tcg_ops->do_transaction_failed) {
@@ -1606,7 +1609,8 @@ static int probe_access_internal(CPUArchState *env, target_ulong addr,
     if (!tlb_hit_page(tlb_addr, page_addr)) {
         if (!victim_tlb_hit(env, mmu_idx, index, elt_ofs, page_addr)) {
             CPUState *cs = env_cpu(env);
-            CPUClass *cc = CPU_GET_CLASS(cs);
+            CPUClass *cc = cs->cc ? cs->cc : CPU_GET_CLASS(cs);
+            cs->cc = cc;
 
             if (!cc->tcg_ops->tlb_fill(cs, addr, fault_size, access_type,
                                        mmu_idx, nonfault, retaddr)) {
diff --git a/hw/ssi/ssi.c b/hw/ssi/ssi.c
index 003931fb50..f749feb6e3 100644
--- a/hw/ssi/ssi.c
+++ b/hw/ssi/ssi.c
@@ -38,7 +38,8 @@ static void ssi_cs_default(void *opaque, int n, int level)
     bool cs = !!level;
     assert(n == 0);
     if (s->cs != cs) {
-        SSIPeripheralClass *ssc = SSI_PERIPHERAL_GET_CLASS(s);
+        /* SSIPeripheralClass *ssc = SSI_PERIPHERAL_GET_CLASS(s); */
+        SSIPeripheralClass *ssc = s->spc;
         if (ssc->set_cs) {
             ssc->set_cs(s, cs);
         }
@@ -48,7 +49,8 @@ static void ssi_cs_default(void *opaque, int n, int level)
 
 static uint32_t ssi_transfer_raw_default(SSIPeripheral *dev, uint32_t val)
 {
-    SSIPeripheralClass *ssc = SSI_PERIPHERAL_GET_CLASS(dev);
+    /* SSIPeripheralClass *ssc = SSI_PERIPHERAL_GET_CLASS(dev); */
+    SSIPeripheralClass *ssc = dev->spc;
 
     if ((dev->cs && ssc->cs_polarity == SSI_CS_HIGH) ||
             (!dev->cs && ssc->cs_polarity == SSI_CS_LOW) ||
@@ -67,6 +69,7 @@ static void ssi_peripheral_realize(DeviceState *dev, Error **errp)
             ssc->cs_polarity != SSI_CS_NONE) {
         qdev_init_gpio_in_named(dev, ssi_cs_default, SSI_GPIO_CS, 1);
     }
+    s->spc = ssc;
 
     ssc->realize(s, errp);
 }
@@ -120,7 +123,8 @@ uint32_t ssi_transfer(SSIBus *bus, uint32_t val)
 
     QTAILQ_FOREACH(kid, &b->children, sibling) {
         SSIPeripheral *peripheral = SSI_PERIPHERAL(kid->child);
-        ssc = SSI_PERIPHERAL_GET_CLASS(peripheral);
+        /* ssc = SSI_PERIPHERAL_GET_CLASS(peripheral); */
+        ssc = peripheral->spc;
         r |= ssc->transfer_raw(peripheral, val);
     }
 
-- 
2.30.2

next             reply	other threads:[~2022-08-04  9:27 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-08-04  9:20 Alex Bennée [this message]
2022-08-04 14:06 ` [RFC PATCH] cputlb and ssi: cache class to avoid expensive object_dynamic_cast_assert (HACK!!!) Cédric Le Goater
2022-08-04 16:51   ` Alex Bennée
2022-08-04 17:33     ` Cédric Le Goater

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:500503da1 dfblob:70027a772 dfblob:f411858ab dfblob:6950f8681
dfblob:ef7258a5e dfblob:2202dbfa4 dfblob:a46f3a654 dfblob:882315f7d
dfblob:003931fb5 dfblob:f749feb6e )
 OR (
bs:"[RFC PATCH] cputlb and ssi: cache class to avoid expensive object_dynamic_cast_assert (HACK!!!)" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220804092044.2101093-1-alex.bennee@linaro.org \
    --to=alex.bennee@linaro.org \
    --cc=alistair@alistair23.me \
    --cc=berrange@redhat.com \
    --cc=clg@kaod.org \
    --cc=eduardo@habkost.net \
    --cc=f4bug@amsat.org \
    --cc=marcel.apfelbaum@gmail.com \
    --cc=pbonzini@redhat.com \
    --cc=qemu-devel@nongnu.org \
    --cc=richard.henderson@linaro.org \
    --cc=wangyanan55@huawei.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).