linux-perf-users.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: "Steinar H. Gunderson" <sesse@google.com>
To: acme@kernel.org
Cc: linux-perf-users@vger.kernel.org, linux-kernel@vger.kernel.org,
	 irogers@google.com, "Steinar H. Gunderson" <sesse@google.com>
Subject: [PATCH v7 4/4] perf report: LLVM-based symbol listing
Date: Sun, 26 May 2024 20:22:12 +0200	[thread overview]
Message-ID: <20240526182212.544525-4-sesse@google.com> (raw)
In-Reply-To: <20240526182212.544525-1-sesse@google.com>

Similar to the previous LLVM patches, this allows symbol enumeration
using LLVM instead of libbfd or libelf. This allows distro builds to
reliably enumerate symbols in e.g. PE binaries as profiled with WINE.

A lot of the logic for e.g. length detection has been carried over
from the libbfd code.

Signed-off-by: Steinar H. Gunderson <sesse@google.com>
---
 tools/perf/util/llvm-c-helpers.cpp | 124 +++++++++++++++++++++++++++++
 tools/perf/util/llvm-c-helpers.h   |  49 ++++++++++++
 tools/perf/util/symbol.c           |  54 ++++++++++++-
 tools/perf/util/symbol.h           |   3 +
 4 files changed, 228 insertions(+), 2 deletions(-)

diff --git a/tools/perf/util/llvm-c-helpers.cpp b/tools/perf/util/llvm-c-helpers.cpp
index 4070e2d5682f..87caf98cc8f6 100644
--- a/tools/perf/util/llvm-c-helpers.cpp
+++ b/tools/perf/util/llvm-c-helpers.cpp
@@ -9,6 +9,7 @@
 #pragma GCC diagnostic ignored "-Wunused-parameter"  /* Needed for LLVM <= 15 */
 #include <llvm/DebugInfo/Symbolize/Symbolize.h>
 #include <llvm/Support/TargetSelect.h>
+#include <llvm/Object/Binary.h>
 #pragma GCC diagnostic pop
 
 #include <stdio.h>
@@ -17,6 +18,8 @@
 extern "C" {
 #include <linux/zalloc.h>
 }
+#include <vector>
+#include <algorithm>
 #include "symbol_conf.h"
 #include "llvm-c-helpers.h"
 
@@ -24,6 +27,7 @@ extern "C"
 char *dso__demangle_sym(struct dso *dso, int kmodule, const char *elf_name);
 
 using namespace llvm;
+using namespace llvm::object;
 using llvm::symbolize::LLVMSymbolizer;
 
 /*
@@ -194,3 +198,123 @@ char *llvm_name_for_data(struct dso *dso, const char *dso_name, u64 addr)
 		dso, res_or_err->Name.c_str(),
 		addr, res_or_err->Start);
 }
+
+int llvm_load_symbols(const char *debugfile, struct llvm_symbol_list *symbols)
+{
+	/* NOTE: This nominally does an mmap, despite the scary name. */
+	ErrorOr<std::unique_ptr<MemoryBuffer>> mem_buf_or_err =
+		MemoryBuffer::getFile(debugfile);
+	if (mem_buf_or_err.getError())
+		return -1;
+
+	Expected<std::unique_ptr<Binary>> binary_or_err(
+		createBinary(mem_buf_or_err.get()->getMemBufferRef(), nullptr));
+	if (!binary_or_err)
+		return -1;
+
+	/* Find the .text section. */
+	SectionRef text_section;
+	uint64_t text_filepos, image_base;
+	for (SectionRef section :
+	     cast<ObjectFile>(*binary_or_err.get()).sections()) {
+		Expected<StringRef> name = section.getName();
+		if (name && *name == ".text") {
+			text_section = section;
+
+			/*
+			 * If we don't find an image base below, we infer the
+			 * image base from the address and file offset of the
+			 * .text section.
+			 */
+			text_filepos = reinterpret_cast<const char *>(
+				text_section.getContents()->bytes_begin()) -
+				mem_buf_or_err.get()->getBufferStart();
+			image_base = text_section.getAddress() - text_filepos;
+			break;
+		}
+	}
+	if (text_section == SectionRef())
+		/* No .text section, so no symbols (but also not a failure). */
+		return 0;
+
+	/*
+	 * See if we can find an explicit image base pseudosymbol. If so, get
+	 * the image base directly from it, then infer the file position of
+	 * .text from that (i.e., the opposite inference of the fallback above).
+	 */
+	for (SymbolRef symbol :
+	     cast<ObjectFile>(*binary_or_err.get()).symbols())
+		if (symbol.getName() &&
+		    symbol.getAddress() &&
+		    (*symbol.getName() == "__ImageBase" ||
+		     *symbol.getName() == "__image_base__")) {
+			image_base = *symbol.getAddress();
+			if (image_base < 0x100000000ULL)
+				/*
+				 * PE symbol values are only 32 bits wide, so
+				 * take the high bits (if any) from .text.
+				 */
+				image_base |= text_section.getAddress() &
+					~0xFFFFFFFFULL;
+			text_filepos = text_section.getAddress() - image_base;
+			break;
+		}
+
+	symbols->image_base = image_base;
+	symbols->text_end = text_filepos + text_section.getSize();
+
+	/* Collect all valid symbols. */
+	std::vector<SymbolRef> all_symbols;
+	for (SymbolRef symbol :
+	     cast<ObjectFile>(*binary_or_err.get()).symbols())
+		if (symbol.getName() && symbol.getFlags() &&
+		    symbol.getAddress() && symbol.getSection())
+			all_symbols.push_back(symbol);
+	symbols->num_symbols = all_symbols.size();
+	symbols->symbols = (struct llvm_symbol *)calloc(
+		all_symbols.size(), sizeof(struct llvm_symbol));
+	if (symbols->symbols == nullptr)
+		return -1;
+
+	/*
+	 * Symbols don't normally come with lengths, so we'll infer them
+	 * from what comes after the symbol address-wise. There is some
+	 * extra logic around zero-length symbols and deduplication,
+	 * which the caller will do for us (it's shared with other backends).
+	 */
+	std::sort(all_symbols.begin(), all_symbols.end(),
+		  [](const SymbolRef &a, const SymbolRef &b) {
+			  if (*a.getAddress() != *b.getAddress())
+				  return *a.getAddress() < *b.getAddress();
+			  return *a.getName() < *b.getName();
+		  });
+	for (size_t i = 0; i < all_symbols.size(); ++i) {
+		const SymbolRef &sym = all_symbols[i];
+		llvm_symbol &out_sym = symbols->symbols[i];
+		out_sym.start = *sym.getAddress() - image_base;
+		out_sym.name = (char *)calloc(1, sym.getName()->size() + 1);
+		if (out_sym.name == nullptr) {
+			for (size_t i = 0; i < all_symbols.size(); ++i) {
+				zfree(&symbols->symbols[i].name);
+			}
+			zfree(&symbols->symbols);
+			return -1;
+		}
+		memcpy(out_sym.name, sym.getName()->bytes_begin(),
+		       sym.getName()->size());
+		out_sym.global = *sym.getFlags() & SymbolRef::SF_Global;
+		out_sym.weak = *sym.getFlags() & SymbolRef::SF_Weak;
+
+		SectionRef section = **sym.getSection();
+		uint64_t next_addr;
+		if (i + 1 < all_symbols.size() &&
+		    section == **all_symbols[i + 1].getSection())
+			next_addr = *all_symbols[i + 1].getAddress();
+		else
+			next_addr = section.getAddress() + section.getSize();
+
+		out_sym.len = next_addr - *sym.getAddress();
+	}
+
+	return 1;
+}
diff --git a/tools/perf/util/llvm-c-helpers.h b/tools/perf/util/llvm-c-helpers.h
index d2b99637a28a..1abbc9ef3dab 100644
--- a/tools/perf/util/llvm-c-helpers.h
+++ b/tools/perf/util/llvm-c-helpers.h
@@ -8,6 +8,7 @@
  */
 
 #include <linux/compiler.h>
+#include <stddef.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -53,6 +54,54 @@ int llvm_addr2line(const char* dso_name,
 char *llvm_name_for_code(struct dso *dso, const char *dso_name, u64 addr);
 char *llvm_name_for_data(struct dso *dso, const char *dso_name, u64 addr);
 
+struct llvm_symbol {
+	u64 start, len;
+	bool weak, global;
+
+	/* Must be freed by the caller. */
+	char *name;
+};
+
+struct llvm_symbol_list {
+	/*
+	 * The address this dso is loaded at in memory; adding this to
+	 * the file position of any symbol will give the address that
+	 * symbol is loaded at.
+	 */
+	u64 image_base;
+
+	/*
+	 * The byte offset of the end of the .text section (one-past-the-end)
+	 * in the file.
+	 */
+	u64 text_end;
+
+	/*
+	 * The actual symbols. Must be freed by the caller, with free().
+	 * The string inside each llvm_symbol must also be freed.
+	 */
+	size_t num_symbols;
+	struct llvm_symbol *symbols;
+};
+
+/*
+ * Get a list of all symbols in the given file using LLVM. Similarly to
+ * addr2line(), we need to implement this using a helper structure;
+ * LLVM actually has a C API for this, but we can't use it (it doesn't
+ * have an equivalent of getFlags()). So we do the meat of the LLVM-facing
+ * logic here and let the caller make perf symbols out of it.
+ *
+ * Possible return values and their meaning:
+ *
+ * -1: An error occurred.
+ *  0: The file loaded fine, but there was no .text section.
+ *  1: The file loaded successfully.
+ *
+ * For -1 and 0, "symbols" does not contain any meaningful information,
+ * and nothing needs to be freed.
+ */
+int llvm_load_symbols(const char *debugfile, struct llvm_symbol_list *symbols);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 9e5940b5bc59..69b8c0a280c7 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -22,6 +22,7 @@
 #include "util.h" // lsdir()
 #include "debug.h"
 #include "event.h"
+#include "llvm-c-helpers.h"
 #include "machine.h"
 #include "map.h"
 #include "symbol.h"
@@ -1566,6 +1567,50 @@ static int dso__load_perf_map(const char *map_path, struct dso *dso)
 	return -1;
 }
 
+#ifdef HAVE_LIBLLVM_SUPPORT
+int dso__load_llvm_symbols(struct dso *dso, const char *debugfile)
+{
+	struct llvm_symbol_list symbols;
+	struct symbol *symbol;
+	int err = llvm_load_symbols(debugfile, &symbols);
+	int ret = -1;
+
+	if (err == -1)
+		return -1;
+	else if (err == 0)
+		return 0;
+
+	for (size_t i = 0; i < symbols.num_symbols; ++i) {
+		const struct llvm_symbol *sym = &symbols.symbols[i];
+		int elf_binding;
+
+		if (sym->weak)
+			elf_binding = STB_WEAK;
+		else if (sym->global)
+			elf_binding = STB_GLOBAL;
+		else
+			elf_binding = STB_LOCAL;
+		symbol = symbol__new(sym->start, sym->len, elf_binding, STT_FUNC,
+				     sym->name);
+		if (!symbol)
+			goto out_free;
+
+		symbols__insert(dso__symbols(dso), symbol);
+	}
+
+	symbols__fixup_end(dso__symbols(dso), false);
+	symbols__fixup_duplicate(dso__symbols(dso));
+	dso__set_adjust_symbols(dso, true);
+	ret = 0;
+
+out_free:
+	for (size_t i = 0; i < symbols.num_symbols; ++i)
+		zfree(&symbols.symbols[i].name);
+	zfree(&symbols.symbols);
+	return ret;
+}
+#endif
+
 #ifdef HAVE_LIBBFD_SUPPORT
 #define PACKAGE 'perf'
 #include <bfd.h>
@@ -1871,6 +1916,7 @@ int dso__load(struct dso *dso, struct map *map)
 		bool next_slot = false;
 		bool is_reg;
 		bool nsexit;
+		int llvmrc = -1;
 		int bfdrc = -1;
 		int sirc = -1;
 
@@ -1899,11 +1945,15 @@ int dso__load(struct dso *dso, struct map *map)
 			}
 		}
 
-#ifdef HAVE_LIBBFD_SUPPORT
+#ifdef HAVE_LIBLLVM_SUPPORT
 		if (is_reg)
+			llvmrc = dso__load_llvm_symbols(dso, name);
+#endif
+#ifdef HAVE_LIBBFD_SUPPORT
+		if (is_reg && llvmrc < 0)
 			bfdrc = dso__load_bfd_symbols(dso, name);
 #endif
-		if (is_reg && bfdrc < 0)
+		if (is_reg && llvmrc < 0 && bfdrc < 0)
 			sirc = symsrc__init(ss, dso, name, symtab_type);
 
 		if (nsexit)
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 3fb5d146d9b1..01256c41467a 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -174,6 +174,9 @@ int symbol__config_symfs(const struct option *opt __maybe_unused,
 
 struct symsrc;
 
+#ifdef HAVE_LIBLLVM_SUPPORT
+int dso__load_llvm_symbols(struct dso *dso, const char *debugfile);
+#endif
 #ifdef HAVE_LIBBFD_SUPPORT
 int dso__load_bfd_symbols(struct dso *dso, const char *debugfile);
 #endif
-- 
2.45.1


  parent reply	other threads:[~2024-05-26 18:22 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-05-26 18:22 [PATCH v7 1/4] perf report: Support LLVM for addr2line() Steinar H. Gunderson
2024-05-26 18:22 ` [PATCH v7 2/4] perf annotate: split out read_symbol() Steinar H. Gunderson
2024-05-26 18:22 ` [PATCH v7 3/4] perf annotate: LLVM-based disassembler Steinar H. Gunderson
2024-05-29  4:06   ` Namhyung Kim
2024-06-02 20:38     ` Steinar H. Gunderson
2024-05-26 18:22 ` Steinar H. Gunderson [this message]
2024-05-26 21:24   ` [PATCH v7 4/4] perf report: LLVM-based symbol listing Steinar H. Gunderson
2024-05-29  4:11   ` Namhyung Kim
2024-06-02 20:39   ` Steinar H. Gunderson
2024-05-29  3:56 ` [PATCH v7 1/4] perf report: Support LLVM for addr2line() Namhyung Kim
2024-05-29 13:52   ` Arnaldo Carvalho de Melo
2024-05-30 16:55     ` Namhyung Kim
2024-05-30 16:58 ` Namhyung Kim
2024-06-02 20:37   ` Steinar H. Gunderson
2024-06-05  1:13     ` Namhyung Kim

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240526182212.544525-4-sesse@google.com \
    --to=sesse@google.com \
    --cc=acme@kernel.org \
    --cc=irogers@google.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-perf-users@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).