All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Đoàn Trần Công Danh" <congdanhqx@gmail.com>
To: Taylor Blau <me@ttaylorr.com>
Cc: git@vger.kernel.org
Subject: Re: test-tool: bloom: generate_filter for multiple string?
Date: Wed, 13 Jan 2021 18:59:52 +0700	[thread overview]
Message-ID: <X/7guF05a/Bb/VNp@danh.dev> (raw)
In-Reply-To: <X/3+PG2hi71tj/UA@nand.local>

[-- Attachment #1: Type: text/plain, Size: 855 bytes --]

On 2021-01-12 14:53:32-0500, Taylor Blau <me@ttaylorr.com> wrote:
> On Thu, Dec 31, 2020 at 10:54:38AM +0700, Đoàn Trần Công Danh wrote:
> > I'm reading the code for Bloom Filter to see if arXiv:2012.00472
> > could be an improvement.
> 
> I'm late to the party, but I'm curious to hear which part of this
> article you think would help out the Bloom filter implementation.

Uhm, no. The article doesn't help the Bloom filter implementation.
The article was suggesting using Bloom filter to speed-up the
negotiation in fetch-pack and upload-pack. Which, in my own quick
experience, doesn't help much. Maybe it's me not understand the
article idea or I have made a naive implementation. However, I'm not
convinced to pursued further.

If you are curious, I'm attaching 2 quick-and-low-quality patches with
this email for your consideration.

-- 
Danh

[-- Attachment #2: 0002-fetch-pack-implent-bloom-filter-in-client-side.patch --]
[-- Type: text/x-diff, Size: 4579 bytes --]

From 287d1b94606bbc5475909e5298560cbd28ee998e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C4=90o=C3=A0n=20Tr=E1=BA=A7n=20C=C3=B4ng=20Danh?=
 <congdanhqx@gmail.com>
Date: Thu, 31 Dec 2020 10:58:22 +0700
Subject: [PATCH 2/3] fetch-pack: implent bloom filter in client side
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Đoàn Trần Công Danh <congdanhqx@gmail.com>
---
 fetch-pack.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 81 insertions(+), 4 deletions(-)

diff --git a/fetch-pack.c b/fetch-pack.c
index 876f90c759..f423ccd816 100644
--- a/fetch-pack.c
+++ b/fetch-pack.c
@@ -23,6 +23,9 @@
 #include "fetch-negotiator.h"
 #include "fsck.h"
 #include "shallow.h"
+#include "bloom.h"
+#include "list-objects.h"
+#include "revision.h"
 
 static int transfer_unpack_limit = -1;
 static int fetch_unpack_limit = -1;
@@ -1184,6 +1187,71 @@ static int add_haves(struct fetch_negotiator *negotiator,
 	return ret;
 }
 
+static int rev_info_insert_ref_oid(const char *refname,
+				   const struct object_id *oid,
+				   int flag, void *cbdata)
+{
+	struct rev_info *rev_info = cbdata;
+	add_pending_oid(rev_info, refname, oid, 0);
+	return 1;
+}
+
+static void rev_info_insert_to_oidset(struct commit *commit, void *cbdata)
+{
+	struct oidset *set = cbdata;
+	oidset_insert(set, &commit->object.oid);
+}
+
+static void add_bloom(struct strbuf *req_buf, struct oidset *common)
+{
+	struct oidset_iter iter;
+	struct oidset haves = OIDSET_INIT;
+	struct rev_info rev_info;
+	struct bloom_filter filter;
+	struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS;
+	struct strbuf buf;
+	const struct object_id *oid;
+	int i;
+
+	if (oidset_size(common) == 0)
+		return;
+	
+	repo_init_revisions(the_repository, &rev_info, NULL);
+
+	oidset_iter_init(common, &iter);
+	while ((oid = oidset_iter_next(&iter))) {
+		add_pending_oid(&rev_info, oid_to_hex(oid), oid,
+			       UNINTERESTING | BOTTOM);
+	}
+
+	for_each_rawref(rev_info_insert_ref_oid, &rev_info);
+	if (prepare_revision_walk(&rev_info)) {
+		warning("couldn't prepare revision walk for fetch");
+		return;
+	}
+	traverse_commit_list(&rev_info, rev_info_insert_to_oidset, NULL, &haves);
+	if (oidset_size(&haves) == 0)
+		return;
+	filter.len = (oidset_size(&haves) * settings.bits_per_entry + BITS_PER_WORD - 1)
+		/ BITS_PER_WORD;
+	free(filter.data);
+	filter.data = xcalloc(filter.len, sizeof(unsigned char));
+	oidset_iter_init(&haves, &iter);
+	while ((oid = oidset_iter_next(&iter))) {
+		struct bloom_key key;
+		fill_bloom_key((const char *)oid->hash, the_hash_algo->rawsz,
+			       &key, &settings);
+		add_key_to_filter(&key, &filter, &settings);
+	}
+
+	strbuf_init(&buf, filter.len * 2 + strlen("bloom \n"));
+	strbuf_addstr(&buf, "bloom ");
+	for (i = 0; i < filter.len; i++)
+		strbuf_addf(&buf, "%02x", filter.data[i]);
+	strbuf_addch(&buf, '\n');
+	packet_buf_write_len(req_buf, buf.buf, buf.len);
+}
+
 static int send_fetch_request(struct fetch_negotiator *negotiator, int fd_out,
 			      struct fetch_pack_args *args,
 			      const struct ref *wants, struct oidset *common,
@@ -1191,6 +1259,7 @@ static int send_fetch_request(struct fetch_negotiator *negotiator, int fd_out,
 			      int sideband_all, int seen_ack)
 {
 	int ret = 0;
+	int transferbloom = 0;
 	const char *hash_name;
 	struct strbuf req_buf = STRBUF_INIT;
 
@@ -1278,6 +1347,11 @@ static int send_fetch_request(struct fetch_negotiator *negotiator, int fd_out,
 	ret = add_haves(negotiator, seen_ack, &req_buf,
 			haves_to_send, in_vain);
 
+	/* Add bloom filter represent commits we have from known common */
+	git_config_get_maybe_bool("transfer.bloom", &transferbloom);
+	if (transferbloom && server_supports_v2("bloom", 0))
+		add_bloom(&req_buf, common);
+
 	/* Send request */
 	packet_buf_flush(&req_buf);
 	if (write_in_full(fd_out, req_buf.buf, req_buf.len) < 0)
@@ -1352,11 +1426,14 @@ static enum common_found process_acks(struct fetch_negotiator *negotiator,
 			struct object_id oid;
 			received_ack = 1;
 			if (!get_oid_hex(arg, &oid)) {
+				struct object *obj;
 				struct commit *commit;
-				oidset_insert(common, &oid);
-				commit = lookup_commit(the_repository, &oid);
-				if (negotiator)
-					negotiator->ack(negotiator, commit);
+				obj = lookup_object(the_repository, &oid);
+				if (obj && (commit = object_as_type(obj, OBJ_COMMIT, 0))) {
+					oidset_insert(common, &oid);
+					if (negotiator)
+						negotiator->ack(negotiator, commit);
+				}
 			}
 			continue;
 		}
-- 
2.30.0.4.gbe3229325d


[-- Attachment #3: 0003-upload-pack-support-bloom-filter.patch --]
[-- Type: text/x-diff, Size: 6093 bytes --]

From 2c93a1a6c37d40ba37eb083a139b6f1312a547e6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C4=90o=C3=A0n=20Tr=E1=BA=A7n=20C=C3=B4ng=20Danh?=
 <congdanhqx@gmail.com>
Date: Mon, 4 Jan 2021 20:23:29 +0700
Subject: [PATCH 3/3] upload-pack: support bloom filter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Đoàn Trần Công Danh <congdanhqx@gmail.com>
---
 serve.c              |  1 +
 t/t5701-git-serve.sh |  1 +
 upload-pack.c        | 86 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 88 insertions(+)

diff --git a/serve.c b/serve.c
index eec2fe6f29..daea53bfb7 100644
--- a/serve.c
+++ b/serve.c
@@ -78,6 +78,7 @@ static struct protocol_capability capabilities[] = {
 	{ "server-option", always_advertise, NULL },
 	{ "object-format", object_format_advertise, NULL },
 	{ "session-id", session_id_advertise, NULL },
+	{ "bloom", always_advertise, NULL },
 };
 
 static void advertise_capabilities(void)
diff --git a/t/t5701-git-serve.sh b/t/t5701-git-serve.sh
index a1f5fdc9fd..9fabdba6b1 100755
--- a/t/t5701-git-serve.sh
+++ b/t/t5701-git-serve.sh
@@ -16,6 +16,7 @@ test_expect_success 'test capability advertisement' '
 	fetch=shallow
 	server-option
 	object-format=$(test_oid algo)
+	bloom
 	0000
 	EOF
 
diff --git a/upload-pack.c b/upload-pack.c
index 3b66bf92ba..0eff165865 100644
--- a/upload-pack.c
+++ b/upload-pack.c
@@ -27,6 +27,7 @@
 #include "commit-graph.h"
 #include "commit-reach.h"
 #include "shallow.h"
+#include "bloom.h"
 
 /* Remember to update object flag allocation in object.h */
 #define THEY_HAVE	(1u << 11)
@@ -62,6 +63,8 @@ struct upload_pack_data {
 	struct object_array have_obj;
 	struct oid_array haves;					/* v2 only */
 	struct string_list wanted_refs;				/* v2 only */
+	struct bloom_filter bloom_filter;			/* v2 only */
+	struct bloom_filter_settings bloom_filter_settings;
 
 	struct object_array shallows;
 	struct string_list deepen_not;
@@ -113,6 +116,13 @@ struct upload_pack_data {
 	unsigned advertise_sid : 1;
 };
 
+struct upload_pack_bloom_args
+{
+	struct bloom_filter *bloom_filter;
+	struct bloom_filter_settings *bloom_filter_settings;
+	struct oid_array *acks;
+};
+
 static void upload_pack_data_init(struct upload_pack_data *data)
 {
 	struct string_list symref = STRING_LIST_INIT_DUP;
@@ -125,6 +135,7 @@ static void upload_pack_data_init(struct upload_pack_data *data)
 	struct string_list uri_protocols = STRING_LIST_INIT_DUP;
 	struct object_array extra_edge_obj = OBJECT_ARRAY_INIT;
 	struct string_list allowed_filters = STRING_LIST_INIT_DUP;
+	struct bloom_filter_settings bloom_filter_settings = DEFAULT_BLOOM_FILTER_SETTINGS;
 
 	memset(data, 0, sizeof(*data));
 	data->symref = symref;
@@ -132,6 +143,7 @@ static void upload_pack_data_init(struct upload_pack_data *data)
 	data->want_obj = want_obj;
 	data->have_obj = have_obj;
 	data->haves = haves;
+	data->bloom_filter_settings = bloom_filter_settings;
 	data->shallows = shallows;
 	data->deepen_not = deepen_not;
 	data->uri_protocols = uri_protocols;
@@ -1464,6 +1476,34 @@ static int parse_have(const char *line, struct oid_array *haves)
 	return 0;
 }
 
+static int parse_bloom(const char *line, struct bloom_filter *filter,
+		       struct bloom_filter_settings *settings)
+{
+	const char *arg;
+	unsigned char *p;
+	size_t len;
+	if (!skip_prefix(line, "bloom ", &arg))
+		return 0;
+
+	len = strlen(arg);
+	if (len % 2)
+		die("git upload-pack: corrupted bloom data with len: %zu", len);
+	filter->len = len / 2;
+	filter->data = p = xmalloc(filter->len);
+	while (*arg) {
+		int val = hex2chr(arg);
+		if (val < 0)
+			goto errout;
+		*p++ = val;
+		arg += 2;
+	}
+	return 1;
+errout:
+	filter->len = 0;
+	FREE_AND_NULL(filter->data);
+	return 0;
+}
+
 static void process_args(struct packet_reader *request,
 			 struct upload_pack_data *data)
 {
@@ -1482,6 +1522,9 @@ static void process_args(struct packet_reader *request,
 		if (parse_have(arg, &data->haves))
 			continue;
 
+		if (parse_bloom(arg, &data->bloom_filter, &data->bloom_filter_settings))
+			continue;
+
 		/* process args like thin-pack */
 		if (!strcmp(arg, "thin-pack")) {
 			data->use_thin_pack = 1;
@@ -1570,6 +1613,48 @@ static int process_haves(struct upload_pack_data *data, struct oid_array *common
 	return 0;
 }
 
+static void rev_info_bloom_ack(struct commit *commit, void *cbdata)
+{
+	struct upload_pack_bloom_args *args = cbdata;
+	struct bloom_key key;
+	struct object_id *oid = &commit->object.oid;
+	fill_bloom_key((const char *)oid->hash, the_hash_algo->rawsz,
+		       &key, args->bloom_filter_settings);
+	if (bloom_filter_contains(args->bloom_filter, &key,
+				  args->bloom_filter_settings))
+		oid_array_append(args->acks, oid);
+}
+
+/* Walking from wanted_obj until haves, add all MAYBE objects to acks */
+static int process_bloom(struct upload_pack_data *data, struct oid_array *acks)
+{
+	struct upload_pack_bloom_args args;
+	struct rev_info rev_info;
+	const struct object_id *oid;
+	int i;
+
+	/* unknown bloom data and/or no known common objects */
+	if (!data->bloom_filter.len || !acks->nr)
+		return 0;
+	repo_init_revisions(the_repository, &rev_info, NULL);
+	for (i = 0; i < acks->nr; i++) {
+		oid = &acks->oid[i];
+		add_pending_oid(&rev_info, oid_to_hex(oid), oid,
+				UNINTERESTING | BOTTOM);
+	}
+	for (i = 0; i < data->want_obj.nr; i++) {
+		oid = &data->want_obj.objects[i].item->oid;
+		add_pending_oid(&rev_info, oid_to_hex(oid), oid, 0);
+	}
+	if (prepare_revision_walk(&rev_info))
+		return 0;
+	args.bloom_filter = &data->bloom_filter;
+	args.bloom_filter_settings = &data->bloom_filter_settings;
+	args.acks = acks;
+	traverse_commit_list(&rev_info, rev_info_bloom_ack, NULL, &args);
+	return 0;
+}
+
 static int send_acks(struct upload_pack_data *data, struct oid_array *acks)
 {
 	int i;
@@ -1600,6 +1685,7 @@ static int process_haves_and_send_acks(struct upload_pack_data *data)
 	int ret = 0;
 
 	process_haves(data, &common);
+	process_bloom(data, &common);
 	if (data->done) {
 		ret = 1;
 	} else if (send_acks(data, &common)) {
-- 
2.30.0.4.gbe3229325d


  reply	other threads:[~2021-01-13 12:00 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-12-31  3:54 test-tool: bloom: generate_filter for multiple string? Đoàn Trần Công Danh
2020-12-31 11:31 ` Derrick Stolee
2021-01-05 13:34   ` Đoàn Trần Công Danh
2021-01-12 19:53 ` Taylor Blau
2021-01-13 11:59   ` Đoàn Trần Công Danh [this message]
2021-01-13 12:06     ` Derrick Stolee
2021-01-13 12:13       ` Đoàn Trần Công Danh
2021-01-13 15:13     ` Taylor Blau

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=X/7guF05a/Bb/VNp@danh.dev \
    --to=congdanhqx@gmail.com \
    --cc=git@vger.kernel.org \
    --cc=me@ttaylorr.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.