git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: "Derrick Stolee via GitGitGadget" <gitgitgadget@gmail.com>
To: git@vger.kernel.org
Cc: gitster@pobox.com, johannes.schindelin@gmx.de, peff@peff.net,
	ps@pks.im, me@ttaylorr.com, johncai86@gmail.com,
	newren@gmail.com, Derrick Stolee <stolee@gmail.com>,
	Derrick Stolee <stolee@gmail.com>
Subject: [PATCH 17/30] survey: add report of "largest" paths
Date: Tue, 10 Sep 2024 02:28:42 +0000	[thread overview]
Message-ID: <9e95914d393ff83054ee419b58b9db4d3560a36c.1725935335.git.gitgitgadget@gmail.com> (raw)
In-Reply-To: <pull.1786.git.1725935335.gitgitgadget@gmail.com>

From: Derrick Stolee <stolee@gmail.com>

Since we are already walking our reachable objects using the path-walk API,
let's now collect lists of the paths that contribute most to different
metrics. Specifically, we care about

 * Number of versions.
 * Total size on disk.
 * Total inflated size (no delta or zlib compression).

This information can be critical to discovering which parts of the
repository are causing the most growth, especially on-disk size. Different
packing strategies might help compress data more efficiently, but the toal
inflated size is a representation of the raw size of all snapshots of those
paths. Even when stored efficiently on disk, that size represents how much
information must be processed to complete a command such as 'git blame'.

Since the on-disk size is likely to be fragile, stop testing the exact
output of 'git survey' and check that the correct set of headers is
output.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 builtin/survey.c      | 90 +++++++++++++++++++++++++++++++++++++------
 t/t8100-git-survey.sh | 12 +++++-
 2 files changed, 90 insertions(+), 12 deletions(-)

diff --git a/builtin/survey.c b/builtin/survey.c
index ad467e9a88c..90b041967c8 100644
--- a/builtin/survey.c
+++ b/builtin/survey.c
@@ -80,7 +80,6 @@ struct survey_report_object_size_summary {
 typedef int (*survey_top_size_cmp)(struct survey_report_object_size_summary *s1,
 				   struct survey_report_object_size_summary *s2);
 
-MAYBE_UNUSED
 static int cmp_by_nr(struct survey_report_object_size_summary *s1,
 		     struct survey_report_object_size_summary *s2)
 {
@@ -91,7 +90,6 @@ static int cmp_by_nr(struct survey_report_object_size_summary *s1,
 	return 0;
 }
 
-MAYBE_UNUSED
 static int cmp_by_disk_size(struct survey_report_object_size_summary *s1,
 			    struct survey_report_object_size_summary *s2)
 {
@@ -102,7 +100,6 @@ static int cmp_by_disk_size(struct survey_report_object_size_summary *s1,
 	return 0;
 }
 
-MAYBE_UNUSED
 static int cmp_by_inflated_size(struct survey_report_object_size_summary *s1,
 				struct survey_report_object_size_summary *s2)
 {
@@ -126,7 +123,6 @@ struct survey_report_top_sizes {
 	size_t alloc;
 };
 
-MAYBE_UNUSED
 static void init_top_sizes(struct survey_report_top_sizes *top,
 			   size_t limit, const char *name,
 			   survey_top_size_cmp cmp)
@@ -146,7 +142,6 @@ static void clear_top_sizes(struct survey_report_top_sizes *top)
 	free(top->data);
 }
 
-MAYBE_UNUSED
 static void maybe_insert_into_top_size(struct survey_report_top_sizes *top,
 				       struct survey_report_object_size_summary *summary)
 {
@@ -182,6 +177,10 @@ struct survey_report {
 	struct survey_report_object_summary reachable_objects;
 
 	struct survey_report_object_size_summary *by_type;
+
+	struct survey_report_top_sizes *top_paths_by_count;
+	struct survey_report_top_sizes *top_paths_by_disk;
+	struct survey_report_top_sizes *top_paths_by_inflate;
 };
 
 #define REPORT_TYPE_COMMIT 0
@@ -423,6 +422,13 @@ static void survey_report_object_sizes(const char *title,
 	clear_table(&table);
 }
 
+static void survey_report_plaintext_sorted_size(
+		struct survey_report_top_sizes *top)
+{
+	survey_report_object_sizes(top->name,  _("Path"),
+				   top->data, top->nr);
+}
+
 static void survey_report_plaintext(struct survey_context *ctx)
 {
 	printf("GIT SURVEY for \"%s\"\n", ctx->repo->worktree);
@@ -433,6 +439,21 @@ static void survey_report_plaintext(struct survey_context *ctx)
 				   _("Object Type"),
 				   ctx->report.by_type,
 				   REPORT_TYPE_COUNT);
+
+	survey_report_plaintext_sorted_size(
+		&ctx->report.top_paths_by_count[REPORT_TYPE_TREE]);
+	survey_report_plaintext_sorted_size(
+		&ctx->report.top_paths_by_count[REPORT_TYPE_BLOB]);
+
+	survey_report_plaintext_sorted_size(
+		&ctx->report.top_paths_by_disk[REPORT_TYPE_TREE]);
+	survey_report_plaintext_sorted_size(
+		&ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB]);
+
+	survey_report_plaintext_sorted_size(
+		&ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE]);
+	survey_report_plaintext_sorted_size(
+		&ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB]);
 }
 
 static void survey_report_json(struct survey_context *ctx)
@@ -673,7 +694,8 @@ static void increment_totals(struct survey_context *ctx,
 
 static void increment_object_totals(struct survey_context *ctx,
 				    struct oid_array *oids,
-				    enum object_type type)
+				    enum object_type type,
+				    const char *path)
 {
 	struct survey_report_object_size_summary *total;
 	struct survey_report_object_size_summary summary = { 0 };
@@ -701,6 +723,27 @@ static void increment_object_totals(struct survey_context *ctx,
 	total->disk_size += summary.disk_size;
 	total->inflated_size += summary.inflated_size;
 	total->num_missing += summary.num_missing;
+
+	if (type == OBJ_TREE || type == OBJ_BLOB) {
+		int index = type == OBJ_TREE ?
+			    REPORT_TYPE_TREE : REPORT_TYPE_BLOB;
+		struct survey_report_top_sizes *top;
+
+		/*
+		 * Temporarily store (const char *) here, but it will
+		 * be duped if inserted and will not be freed.
+		 */
+		summary.label = (char *)path;
+
+		top = ctx->report.top_paths_by_count;
+		maybe_insert_into_top_size(&top[index], &summary);
+
+		top = ctx->report.top_paths_by_disk;
+		maybe_insert_into_top_size(&top[index], &summary);
+
+		top = ctx->report.top_paths_by_inflate;
+		maybe_insert_into_top_size(&top[index], &summary);
+	}
 }
 
 static int survey_objects_path_walk_fn(const char *path,
@@ -712,7 +755,7 @@ static int survey_objects_path_walk_fn(const char *path,
 
 	increment_object_counts(&ctx->report.reachable_objects,
 				type, oids->nr);
-	increment_object_totals(ctx, oids, type);
+	increment_object_totals(ctx, oids, type, path);
 
 	ctx->progress_nr += oids->nr;
 	display_progress(ctx->progress, ctx->progress_nr);
@@ -757,6 +800,34 @@ static int iterate_tag_chain(struct survey_context *ctx,
 	return -1;
 }
 
+static void initialize_report(struct survey_context *ctx)
+{
+	const int top_limit = 100;
+
+	CALLOC_ARRAY(ctx->report.by_type, REPORT_TYPE_COUNT);
+	ctx->report.by_type[REPORT_TYPE_COMMIT].label = xstrdup(_("Commits"));
+	ctx->report.by_type[REPORT_TYPE_TREE].label = xstrdup(_("Trees"));
+	ctx->report.by_type[REPORT_TYPE_BLOB].label = xstrdup(_("Blobs"));
+
+	CALLOC_ARRAY(ctx->report.top_paths_by_count, REPORT_TYPE_COUNT);
+	init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_TREE],
+		       top_limit, _("TOP DIRECTORIES BY COUNT"), cmp_by_nr);
+	init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_BLOB],
+		       top_limit, _("TOP FILES BY COUNT"), cmp_by_nr);
+
+	CALLOC_ARRAY(ctx->report.top_paths_by_disk, REPORT_TYPE_COUNT);
+	init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_TREE],
+		       top_limit, _("TOP DIRECTORIES BY DISK SIZE"), cmp_by_disk_size);
+	init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB],
+		       top_limit, _("TOP FILES BY DISK SIZE"), cmp_by_disk_size);
+
+	CALLOC_ARRAY(ctx->report.top_paths_by_inflate, REPORT_TYPE_COUNT);
+	init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE],
+		       top_limit, _("TOP DIRECTORIES BY INFLATED SIZE"), cmp_by_inflated_size);
+	init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB],
+		       top_limit, _("TOP FILES BY INFLATED SIZE"), cmp_by_inflated_size);
+}
+
 static void survey_phase_objects(struct survey_context *ctx)
 {
 	struct rev_info revs = REV_INFO_INIT;
@@ -774,10 +845,7 @@ static void survey_phase_objects(struct survey_context *ctx)
 	info.blobs = 1;
 	info.tags = 1;
 
-	CALLOC_ARRAY(ctx->report.by_type, REPORT_TYPE_COUNT);
-	ctx->report.by_type[REPORT_TYPE_COMMIT].label = xstrdup(_("Commits"));
-	ctx->report.by_type[REPORT_TYPE_TREE].label = xstrdup(_("Trees"));
-	ctx->report.by_type[REPORT_TYPE_BLOB].label = xstrdup(_("Blobs"));
+	initialize_report(ctx);
 
 	repo_init_revisions(ctx->repo, &revs, "");
 
diff --git a/t/t8100-git-survey.sh b/t/t8100-git-survey.sh
index f8af9601214..c2dab0033f9 100755
--- a/t/t8100-git-survey.sh
+++ b/t/t8100-git-survey.sh
@@ -60,7 +60,17 @@ test_expect_success 'git survey (default)' '
 	      Blobs |    10 |       191 |           101
 	EOF
 
-	test_cmp expect out
+	lines=$(wc -l <expect) &&
+	head -n $lines out >out-trimmed &&
+	test_cmp expect out-trimmed &&
+
+	for type in "DIRECTORIES" "FILES"
+	do
+		for metric in "COUNT" "DISK SIZE" "INFLATED SIZE"
+		do
+			grep "TOP $type BY $metric" out || return 1
+		done || return 1
+	done
 '
 
 test_done
-- 
gitgitgadget


  parent reply	other threads:[~2024-09-10  2:29 UTC|newest]

Thread overview: 38+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-09-10  2:28 [PATCH 00/30] [RFC] Path-walk API and applications Derrick Stolee via GitGitGadget
2024-09-10  2:28 ` [PATCH 01/30] path-walk: introduce an object walk by path Derrick Stolee via GitGitGadget
2024-09-10  2:28 ` [PATCH 02/30] backfill: add builtin boilerplate Derrick Stolee via GitGitGadget
2024-09-10  2:28 ` [PATCH 03/30] backfill: basic functionality and tests Derrick Stolee via GitGitGadget
2024-09-10  2:28 ` [PATCH 04/30] backfill: add --batch-size=<n> option Derrick Stolee via GitGitGadget
2024-09-10  2:28 ` [PATCH 05/30] backfill: add --sparse option Derrick Stolee via GitGitGadget
2024-09-10  2:28 ` [PATCH 06/30] backfill: assume --sparse when sparse-checkout is enabled Derrick Stolee via GitGitGadget
2024-09-10  2:28 ` [PATCH 07/30] path-walk: allow consumer to specify object types Derrick Stolee via GitGitGadget
2024-09-10  2:28 ` [PATCH 08/30] path-walk: allow visiting tags Derrick Stolee via GitGitGadget
2024-09-10  2:28 ` [PATCH 09/30] survey: stub in new experimental `git-survey` command Jeff Hostetler via GitGitGadget
2024-09-10  2:28 ` [PATCH 10/30] survey: add command line opts to select references Jeff Hostetler via GitGitGadget
2024-09-10  2:28 ` [PATCH 11/30] survey: collect the set of requested refs Jeff Hostetler via GitGitGadget
2024-09-10  2:28 ` [PATCH 12/30] survey: start pretty printing data in table form Derrick Stolee via GitGitGadget
2024-09-10  2:28 ` [PATCH 13/30] survey: add object count summary Derrick Stolee via GitGitGadget
2024-09-10  2:28 ` [PATCH 14/30] survey: summarize total sizes by object type Derrick Stolee via GitGitGadget
2024-09-10  2:28 ` [PATCH 15/30] survey: show progress during object walk Derrick Stolee via GitGitGadget
2024-09-10  2:28 ` [PATCH 16/30] survey: add ability to track prioritized lists Derrick Stolee via GitGitGadget
2024-09-10  2:28 ` Derrick Stolee via GitGitGadget [this message]
2024-09-10  2:28 ` [PATCH 18/30] revision: create mark_trees_uninteresting_dense() Derrick Stolee via GitGitGadget
2024-09-10  2:28 ` [PATCH 19/30] path-walk: add prune_all_uninteresting option Derrick Stolee via GitGitGadget
2024-09-10  2:28 ` [PATCH 20/30] pack-objects: add --path-walk option Derrick Stolee via GitGitGadget
2024-09-10  2:28 ` [PATCH 21/30] pack-objects: extract should_attempt_deltas() Derrick Stolee via GitGitGadget
2024-09-10  2:28 ` [PATCH 22/30] pack-objects: introduce GIT_TEST_PACK_PATH_WALK Derrick Stolee via GitGitGadget
2024-09-10  2:28 ` [PATCH 23/30] p5313: add size comparison test Derrick Stolee via GitGitGadget
2024-09-10  2:28 ` [PATCH 24/30] repack: add --path-walk option Derrick Stolee via GitGitGadget
2024-09-10  2:28 ` [PATCH 25/30] pack-objects: enable --path-walk via config Derrick Stolee via GitGitGadget
2024-09-10  2:28 ` [PATCH 26/30] scalar: enable path-walk during push " Derrick Stolee via GitGitGadget
2024-09-10  2:28 ` [PATCH 27/30] pack-objects: add --full-name-hash option Derrick Stolee via GitGitGadget
2024-09-10  2:28 ` [PATCH 28/30] test-name-hash: add helper to compute name-hash functions Derrick Stolee via GitGitGadget
2024-09-10  2:28 ` [PATCH 29/30] p5314: add a size test for name-hash collisions Derrick Stolee via GitGitGadget
2024-09-10  2:28 ` [PATCH 30/30] pack-objects: output debug info about deltas Derrick Stolee via GitGitGadget
2024-09-11 21:32 ` [PATCH 00/30] [RFC] Path-walk API and applications Junio C Hamano
2024-09-17 10:41 ` Christian Couder
2024-09-18 23:18   ` Derrick Stolee
2024-09-22 18:37     ` Junio C Hamano
2024-09-23  1:22       ` Derrick Stolee
2024-09-23 16:56         ` Junio C Hamano
2024-09-22 21:08 ` Kristoffer Haugsbakk

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=9e95914d393ff83054ee419b58b9db4d3560a36c.1725935335.git.gitgitgadget@gmail.com \
    --to=gitgitgadget@gmail.com \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=johannes.schindelin@gmx.de \
    --cc=johncai86@gmail.com \
    --cc=me@ttaylorr.com \
    --cc=newren@gmail.com \
    --cc=peff@peff.net \
    --cc=ps@pks.im \
    --cc=stolee@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).