git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Charles Bailey <charles@hashpling.org>
To: Junio Hamano <gitster@pobox.com>, git@vger.kernel.org
Subject: [PATCH 3/3] Add filter-objects command
Date: Fri, 19 Jun 2015 10:10:59 +0100	[thread overview]
Message-ID: <1434705059-2793-4-git-send-email-charles@hashpling.org> (raw)
In-Reply-To: <1434705059-2793-1-git-send-email-charles@hashpling.org>

From: Charles Bailey <cbailey32@bloomberg.net>

filter-objects is a command to scan all objects in the object database
for the repository and print the ids of those which match the given
criteria.

The currently supported criteria are object type and the minimum size of
the object.

The guiding use case is to scan repositories quickly for large objects
which may cause performance issues for users. The list of objects can
then be used to guide some future remediating action.

Signed-off-by: Charles Bailey <cbailey32@bloomberg.net>
---
 Documentation/git-filter-objects.txt | 38 +++++++++++++++++++
 Makefile                             |  1 +
 builtin.h                            |  1 +
 builtin/filter-objects.c             | 73 ++++++++++++++++++++++++++++++++++++
 git.c                                |  1 +
 t/t8100-filter-objects.sh            | 67 +++++++++++++++++++++++++++++++++
 6 files changed, 181 insertions(+)
 create mode 100644 Documentation/git-filter-objects.txt
 create mode 100644 builtin/filter-objects.c
 create mode 100755 t/t8100-filter-objects.sh

diff --git a/Documentation/git-filter-objects.txt b/Documentation/git-filter-objects.txt
new file mode 100644
index 0000000..c10ca01
--- /dev/null
+++ b/Documentation/git-filter-objects.txt
@@ -0,0 +1,38 @@
+git-filter-objects(1)
+=====================
+
+NAME
+----
+git-filter-objects - Scan through all objects in the repository and print those
+matching a given filter
+
+
+SYNOPSIS
+--------
+[verse]
+'git filter-objects' [-t <type> | --type=<type>] [--min-size=<size>]
+	[-v|--verbose]
+
+DESCRIPTION
+-----------
+Scans all objects in a repository - including any unreachable objects - and
+prints out the ids of all matching objects.  If `--verbose` is specified then
+the object type and size are printed out as well as its id.
+
+OPTIONS
+-------
+-t::
+--type::
+	Only list objects whose type matches <type>.
+
+--min-size::
+	Only list objects whose size is at least <size> bytes.
+
+-v::
+--verbose::
+	Output in the following format instead of just printing object ids:
+	<sha1> SP <type> SP <size>
+
+GIT
+---
+Part of the linkgit:git[1] suite
diff --git a/Makefile b/Makefile
index 149f1c7..a7c017f 100644
--- a/Makefile
+++ b/Makefile
@@ -842,6 +842,7 @@ BUILTIN_OBJS += builtin/diff.o
 BUILTIN_OBJS += builtin/fast-export.o
 BUILTIN_OBJS += builtin/fetch-pack.o
 BUILTIN_OBJS += builtin/fetch.o
+BUILTIN_OBJS += builtin/filter-objects.o
 BUILTIN_OBJS += builtin/fmt-merge-msg.o
 BUILTIN_OBJS += builtin/for-each-ref.o
 BUILTIN_OBJS += builtin/fsck.o
diff --git a/builtin.h b/builtin.h
index b87df70..5a15693 100644
--- a/builtin.h
+++ b/builtin.h
@@ -62,6 +62,7 @@ extern int cmd_diff_tree(int argc, const char **argv, const char *prefix);
 extern int cmd_fast_export(int argc, const char **argv, const char *prefix);
 extern int cmd_fetch(int argc, const char **argv, const char *prefix);
 extern int cmd_fetch_pack(int argc, const char **argv, const char *prefix);
+extern int cmd_filter_objects(int argc, const char **argv, const char *prefix);
 extern int cmd_fmt_merge_msg(int argc, const char **argv, const char *prefix);
 extern int cmd_for_each_ref(int argc, const char **argv, const char *prefix);
 extern int cmd_format_patch(int argc, const char **argv, const char *prefix);
diff --git a/builtin/filter-objects.c b/builtin/filter-objects.c
new file mode 100644
index 0000000..c40d621
--- /dev/null
+++ b/builtin/filter-objects.c
@@ -0,0 +1,73 @@
+#include "cache.h"
+#include "builtin.h"
+#include "revision.h"
+#include "parse-options.h"
+
+#include <stdio.h>
+
+static int req_type;
+static unsigned long min_size;
+static int verbose;
+
+static int check_object(const unsigned char *sha1) /* print sha1 if it passes the filters */
+{
+	unsigned long size;
+	int type = sha1_object_info(sha1, &size); /* size is handed over by address and read below */
+
+	if (type < 0)
+		return -1; /* a negative type is treated as failure */
+
+	if (size >= min_size && (!req_type || type == req_type)) { /* req_type of 0 means any type */
+		if (verbose)
+			printf("%s %s %lu\n", sha1_to_hex(sha1), typename(type), size); /* id, type, size */
+		else
+			printf("%s\n", sha1_to_hex(sha1)); /* id only */
+	}
+
+	return 0; /* 0 whether or not the object was printed */
+}
+
+static int check_loose_object(const unsigned char *sha1, /* callback given to for_each_loose_object() */
+			      const char *path, /* not used */
+			      void *data) /* not used */
+{
+	return check_object(sha1); /* only the object id matters for filtering */
+}
+
+static int check_packed_object(const unsigned char *sha1, /* callback given to for_each_packed_object() */
+			       struct packed_git *pack, /* not used */
+			       uint32_t pos, /* not used */
+			       void *data) /* not used */
+{
+	return check_object(sha1); /* same filter as for loose objects */
+}
+
+static char *opt_type; /* raw -t/--type argument; stays NULL when the option is absent */
+static struct option builtin_filter_objects_options[] = { /* option table passed to parse_options() */
+	OPT_ULONG(0, "min-size", &min_size, "minimum size of object to show"), /* long option only */
+	OPT_STRING('t', "type", &opt_type, NULL, "type of objects to show"), /* converted to req_type later */
+	OPT__VERBOSE(&verbose, "show object type and size"), /* selects the three-field output */
+	OPT_END()
+};
+
+int cmd_filter_objects(int argc, const char **argv, const char *prefix) /* "git filter-objects" entry point */
+{
+	struct packed_git *p;
+
+	argc = parse_options(argc, argv, prefix, builtin_filter_objects_options,
+			     NULL, 0); /* NOTE(review): usage array is NULL - confirm -h and bad options are safe */
+
+	if (opt_type)
+		req_type = type_from_string(opt_type); /* only set when -t/--type was given */
+
+	for_each_loose_object(check_loose_object, NULL, 0); /* first pass: loose objects */
+
+	prepare_packed_git();
+	for (p = packed_git; p; p = p->next) { /* walk the list of known packs */
+		open_pack_index(p); /* NOTE(review): return value ignored - confirm failure is harmless */
+	}
+
+	for_each_packed_object(check_packed_object, NULL, 0); /* second pass: packed objects */
+
+	return 0; /* NOTE(review): results of both passes are discarded, so callback errors are not reported */
+}
diff --git a/git.c b/git.c
index 44374b1..4c87afd 100644
--- a/git.c
+++ b/git.c
@@ -403,6 +403,7 @@ static struct cmd_struct commands[] = {
 	{ "fast-export", cmd_fast_export, RUN_SETUP },
 	{ "fetch", cmd_fetch, RUN_SETUP },
 	{ "fetch-pack", cmd_fetch_pack, RUN_SETUP },
+	{ "filter-objects", cmd_filter_objects, RUN_SETUP },
 	{ "fmt-merge-msg", cmd_fmt_merge_msg, RUN_SETUP },
 	{ "for-each-ref", cmd_for_each_ref, RUN_SETUP },
 	{ "format-patch", cmd_format_patch, RUN_SETUP },
diff --git a/t/t8100-filter-objects.sh b/t/t8100-filter-objects.sh
new file mode 100755
index 0000000..4b0137b
--- /dev/null
+++ b/t/t8100-filter-objects.sh
@@ -0,0 +1,67 @@
+#!/bin/sh
+
+test_description='git filter-objects'
+. ./test-lib.sh
+
+test_expect_success 'setup' '
+	echo hello, world >file &&
+	git add file &&
+	git commit -m "initial"
+'
+
+test_expect_success 'filter by type' '
+	git rev-parse HEAD >expected &&
+	git filter-objects -t commit >result &&
+	test_cmp expected result &&
+	git rev-parse HEAD:file >expected &&
+	git filter-objects -t blob >result &&
+	test_cmp expected result &&
+	git rev-parse HEAD^{tree} >expected &&
+	git filter-objects -t tree >result &&
+	test_cmp expected result
+'
+
+test_expect_success 'filter by type after pack' '
+	git repack -Ad &&
+	git rev-parse HEAD >expected &&
+	git filter-objects -t commit >result &&
+	test_cmp expected result &&
+	git rev-parse HEAD:file >expected &&
+	git filter-objects -t blob >result &&
+	test_cmp expected result &&
+	git rev-parse HEAD^{tree} >expected &&
+	git filter-objects -t tree >result &&
+	test_cmp expected result
+'
+
+test_expect_success 'verbose output' '
+	echo $(git rev-parse HEAD) commit $(git cat-file -s HEAD) >expected &&
+	git filter-objects -v -t commit >result &&
+	test_cmp expected result &&
+	echo $(git rev-parse HEAD:file) blob $(git cat-file -s HEAD:file) >expected &&
+	git filter-objects -v -t blob >result &&
+	test_cmp expected result &&
+	echo $(git rev-parse HEAD^{tree}) tree $(git cat-file -s HEAD^{tree}) >expected &&
+	git filter-objects -v -t tree >result &&
+	test_cmp expected result
+'
+
+test_expect_success 'filter on size' '
+	git commit -F - --allow-empty <<-\EOF &&
+		This is a reasonably long commit message
+
+		It is designed to make sure that we create an object
+		that is substantially larger than all the others.
+
+		Our test file blob is a few bytes, our tree is similarly
+		small and our first commit is not too big.
+
+		This message alone is about 300 characters and a sample
+		commit from it has been measured at 562 bytes.
+	EOF
+	git rev-parse HEAD >expected &&
+	git filter-objects --min-size=500 >result &&
+	test_cmp expected result
+'
+
+test_done
-- 
2.4.0.53.g8440f74

  parent reply	other threads:[~2015-06-19  9:37 UTC|newest]

Thread overview: 51+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-06-19  9:10 Improvements to parse-options and a new filter-objects command Charles Bailey
2015-06-19  9:10 ` [PATCH 1/3] Correct test-parse-options to handle negative ints Charles Bailey
2015-06-19 18:28   ` Junio C Hamano
2015-06-19  9:10 ` [PATCH 2/3] Move unsigned long option parsing out of pack-objects.c Charles Bailey
2015-06-19 11:03   ` Remi Galan Alfonso
2015-06-19 11:06     ` Charles Bailey
2015-06-19 17:58   ` Junio C Hamano
2015-06-19 18:39     ` Junio C Hamano
2015-06-20 15:31       ` Jakub Narębski
2015-06-19 18:47     ` Jakub Narębski
2015-06-20 16:51     ` Charles Bailey
2015-06-20 17:47       ` Junio C Hamano
2015-06-19  9:10 ` Charles Bailey [this message]
2015-06-19 10:10   ` [PATCH 3/3] Add filter-objects command Jeff King
2015-06-19 10:33     ` Charles Bailey
2015-06-19 10:52       ` Jeff King
2015-06-19 18:28         ` Junio C Hamano
2015-06-19 10:52       ` John Keeping
2015-06-19 11:04         ` Charles Bailey
2015-06-21 18:25 ` Improvements to integer option parsing Charles Bailey
2015-06-21 18:25   ` [PATCH 1/2] Correct test-parse-options to handle negative ints Charles Bailey
2015-06-21 18:25   ` [PATCH 2/2] Move unsigned long option parsing out of pack-objects.c Charles Bailey
2015-06-21 18:30     ` Charles Bailey
2015-06-22 22:03       ` Junio C Hamano
2015-06-22 22:08     ` Junio C Hamano
2015-06-22 22:09   ` Improvements to integer option parsing Junio C Hamano
2015-06-22 22:42     ` Charles Bailey
2015-06-21 19:20 ` Fast enumeration of objects Charles Bailey
2015-06-21 19:20   ` [PATCH] Add list-all-objects command Charles Bailey
2015-06-22  8:38     ` Jeff King
2015-06-22 10:33       ` Jeff King
2015-06-22 10:40         ` [PATCH 1/7] for_each_packed_object: automatically open pack index Jeff King
2015-06-22 10:40         ` [PATCH 2/7] cat-file: minor style fix in options list Jeff King
2015-06-22 10:41         ` [PATCH 3/7] cat-file: move batch_options definition to top of file Jeff King
2015-06-22 10:45         ` [PATCH 4/7] cat-file: add --buffer option Jeff King
2015-06-22 10:45         ` [PATCH 5/7] cat-file: stop returning value from batch_one_object Jeff King
2015-06-22 10:45         ` [PATCH 6/7] cat-file: split batch_one_object into two stages Jeff King
2015-06-22 10:45         ` [PATCH 7/7] cat-file: add --batch-all-objects option Jeff King
2015-06-26  6:56           ` Eric Sunshine
2015-06-26 15:48             ` Jeff King
2015-06-22 11:06         ` [PATCH 8/7] cat-file: sort and de-dup output of --batch-all-objects Jeff King
2015-06-22 22:03           ` Charles Bailey
2015-06-22 23:46             ` Jeff King
2015-06-22 21:48         ` [PATCH] Add list-all-objects command Charles Bailey
2015-06-22 21:50         ` Junio C Hamano
2015-06-22 23:50           ` Jeff King
2015-06-22 11:38       ` Charles Bailey
2015-06-22  9:57     ` Duy Nguyen
2015-06-22 10:24       ` Jeff King
2015-06-22  8:35   ` Fast enumeration of objects Jeff King
2015-06-22 19:44     ` Junio C Hamano

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1434705059-2793-4-git-send-email-charles@hashpling.org \
    --to=charles@hashpling.org \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).