* [PATCH 2/2] pack-objects: fix threaded load balancing
@ 2007-12-08 5:03 Nicolas Pitre
2007-12-08 9:18 ` Jeff King
` (3 more replies)
0 siblings, 4 replies; 16+ messages in thread
From: Nicolas Pitre @ 2007-12-08 5:03 UTC (permalink / raw)
To: Junio C Hamano; +Cc: git, Jon Smirl
The current method consists of a master thread serving chunks of objects
to work threads when they're done with their previous chunk. The issue
is to determine the best chunk size: making it too large creates poor
load balancing, while making it too small has a negative effect on pack
size because of the increased number of chunk boundaries and poor delta
window utilization.
This patch implements a completely different approach by initially
splitting the work into large chunks uniformly amongst all threads;
whenever a thread is done, it steals half of the remaining work from
the thread with the largest amount of unprocessed objects.
This has the advantage of greatly reducing the number of chunk boundaries
while achieving almost perfect load balancing.
Signed-off-by: Nicolas Pitre <nico@cam.org>
---
builtin-pack-objects.c | 117 +++++++++++++++++++++++++++++++++++-------------
1 files changed, 85 insertions(+), 32 deletions(-)
diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c
index 5002cc6..fcc1901 100644
--- a/builtin-pack-objects.c
+++ b/builtin-pack-objects.c
@@ -1479,10 +1479,10 @@ static unsigned long free_unpacked(struct unpacked *n)
return freed_mem;
}
-static void find_deltas(struct object_entry **list, unsigned list_size,
+static void find_deltas(struct object_entry **list, unsigned *list_size,
int window, int depth, unsigned *processed)
{
- uint32_t i = 0, idx = 0, count = 0;
+ uint32_t i, idx = 0, count = 0;
unsigned int array_size = window * sizeof(struct unpacked);
struct unpacked *array;
unsigned long mem_usage = 0;
@@ -1490,11 +1490,23 @@ static void find_deltas(struct object_entry **list, unsigned list_size,
array = xmalloc(array_size);
memset(array, 0, array_size);
- do {
- struct object_entry *entry = list[i++];
+ for (;;) {
+ struct object_entry *entry = *list++;
struct unpacked *n = array + idx;
int j, max_depth, best_base = -1;
+ progress_lock();
+ if (!*list_size) {
+ progress_unlock();
+ break;
+ }
+ (*list_size)--;
+ if (!entry->preferred_base) {
+ (*processed)++;
+ display_progress(progress_state, *processed);
+ }
+ progress_unlock();
+
mem_usage -= free_unpacked(n);
n->entry = entry;
@@ -1512,11 +1524,6 @@ static void find_deltas(struct object_entry **list, unsigned list_size,
if (entry->preferred_base)
goto next;
- progress_lock();
- (*processed)++;
- display_progress(progress_state, *processed);
- progress_unlock();
-
/*
* If the current object is at pack edge, take the depth the
* objects that depend on the current object into account
@@ -1576,7 +1583,7 @@ static void find_deltas(struct object_entry **list, unsigned list_size,
count++;
if (idx >= window)
idx = 0;
- } while (i < list_size);
+ }
for (i = 0; i < window; ++i) {
free_delta_index(array[i].index);
@@ -1591,6 +1598,7 @@ struct thread_params {
pthread_t thread;
struct object_entry **list;
unsigned list_size;
+ unsigned remaining;
int window;
int depth;
unsigned *processed;
@@ -1612,10 +1620,10 @@ static void *threaded_find_deltas(void *arg)
pthread_mutex_lock(&data_ready);
pthread_mutex_unlock(&data_request);
- if (!me->list_size)
+ if (!me->remaining)
return NULL;
- find_deltas(me->list, me->list_size,
+ find_deltas(me->list, &me->remaining,
me->window, me->depth, me->processed);
}
}
@@ -1624,57 +1632,102 @@ static void ll_find_deltas(struct object_entry **list, unsigned list_size,
int window, int depth, unsigned *processed)
{
struct thread_params *target, p[delta_search_threads];
- int i, ret;
- unsigned chunk_size;
+ int i, ret, active_threads = 0;
if (delta_search_threads <= 1) {
- find_deltas(list, list_size, window, depth, processed);
+ find_deltas(list, &list_size, window, depth, processed);
return;
}
pthread_mutex_lock(&data_provider);
pthread_mutex_lock(&data_ready);
+ /* Start work threads. */
for (i = 0; i < delta_search_threads; i++) {
p[i].window = window;
p[i].depth = depth;
p[i].processed = processed;
+ p[i].remaining = 0;
ret = pthread_create(&p[i].thread, NULL,
threaded_find_deltas, &p[i]);
if (ret)
die("unable to create thread: %s", strerror(ret));
+ active_threads++;
}
- /* this should be auto-tuned somehow */
- chunk_size = window * 1000;
+ /* Then partition the work amongst them. */
+ for (i = 0; i < delta_search_threads; i++) {
+ unsigned sub_size = list_size / (delta_search_threads - i);
- do {
- unsigned sublist_size = chunk_size;
- if (sublist_size > list_size)
- sublist_size = list_size;
+ pthread_mutex_lock(&data_provider);
+ target = data_requester;
+ if (!sub_size) {
+ pthread_mutex_unlock(&data_ready);
+ pthread_join(target->thread, NULL);
+ active_threads--;
+ continue;
+ }
/* try to split chunks on "path" boundaries */
- while (sublist_size < list_size && list[sublist_size]->hash &&
- list[sublist_size]->hash == list[sublist_size-1]->hash)
- sublist_size++;
+ while (sub_size < list_size && list[sub_size]->hash &&
+ list[sub_size]->hash == list[sub_size-1]->hash)
+ sub_size++;
+
+ target->list = list;
+ target->list_size = sub_size;
+ target->remaining = sub_size;
+ pthread_mutex_unlock(&data_ready);
+ list += sub_size;
+ list_size -= sub_size;
+ }
+
+ /*
+ * Now let's wait for work completion. Each time a thread is done
+ * with its work, we steal half of the remaining work from the
+ * thread with the largest number of unprocessed objects and give
+ * it to that newly idle thread. This ensure good load balancing
+ * until the remaining object list segments are simply too short
+ * to be worth splitting anymore.
+ */
+ do {
+ struct thread_params *victim = NULL;
+ unsigned sub_size = 0;
pthread_mutex_lock(&data_provider);
target = data_requester;
- target->list = list;
- target->list_size = sublist_size;
+
+ progress_lock();
+ for (i = 0; i < delta_search_threads; i++)
+ if (p[i].remaining > 2*window &&
+ (!victim || victim->remaining < p[i].remaining))
+ victim = &p[i];
+ if (victim) {
+ sub_size = victim->remaining / 2;
+ list = victim->list + victim->list_size - sub_size;
+ while (sub_size && list[0]->hash &&
+ list[0]->hash == list[-1]->hash) {
+ list++;
+ sub_size--;
+ }
+ target->list = list;
+ victim->list_size -= sub_size;
+ victim->remaining -= sub_size;
+ }
+ progress_unlock();
+
+ target->list_size = sub_size;
+ target->remaining = sub_size;
pthread_mutex_unlock(&data_ready);
- list += sublist_size;
- list_size -= sublist_size;
- if (!sublist_size) {
+ if (!sub_size) {
pthread_join(target->thread, NULL);
- i--;
+ active_threads--;
}
- } while (i);
+ } while (active_threads);
}
#else
-#define ll_find_deltas find_deltas
+#define ll_find_deltas(l, s, w, d, p) find_deltas(l, &s, w, d, p)
#endif
static void prepare_pack(int window, int depth)
--
1.5.3.7.2184.ge321d-dirty
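For readers who want the stealing policy in one place rather than spread
across the hunks above, here is a minimal illustrative sketch using
simplified, hypothetical types (not a drop-in for builtin-pack-objects.c;
the real code additionally holds progress_lock() and prefers to split on
path-hash boundaries):

struct object_entry;

struct worker {
	struct object_entry **list;	/* start of this worker's segment */
	unsigned list_size;		/* objects handed to this worker */
	unsigned remaining;		/* objects it has not processed yet */
};

/*
 * Hand half of the busiest worker's unprocessed objects to 'idle'.
 * Returns 0 when no worker has enough work left to be worth splitting,
 * which is the signal to let the idle thread terminate.
 */
static unsigned steal_work(struct worker *w, int nr_workers,
			   struct worker *idle, int window)
{
	struct worker *victim = NULL;
	unsigned sub_size = 0;
	int i;

	for (i = 0; i < nr_workers; i++)
		if (w[i].remaining > 2 * window &&
		    (!victim || victim->remaining < w[i].remaining))
			victim = &w[i];

	if (victim) {
		/* steal the tail end of the victim's segment */
		sub_size = victim->remaining / 2;
		idle->list = victim->list + victim->list_size - sub_size;
		victim->list_size -= sub_size;
		victim->remaining -= sub_size;
	}
	idle->list_size = sub_size;
	idle->remaining = sub_size;
	return sub_size;
}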
* Re: [PATCH 2/2] pack-objects: fix threaded load balancing
2007-12-08 5:03 [PATCH 2/2] pack-objects: fix threaded load balancing Nicolas Pitre
@ 2007-12-08 9:18 ` Jeff King
2007-12-10 4:10 ` Jon Smirl
` (2 subsequent siblings)
3 siblings, 0 replies; 16+ messages in thread
From: Jeff King @ 2007-12-08 9:18 UTC (permalink / raw)
To: Nicolas Pitre; +Cc: Junio C Hamano, git, Jon Smirl
On Sat, Dec 08, 2007 at 12:03:17AM -0500, Nicolas Pitre wrote:
> This patch implements a completely different approach by initially
> splitting the work into large chunks uniformly amongst all threads;
> whenever a thread is done, it steals half of the remaining work from
> the thread with the largest amount of unprocessed objects.
Yay! This works great on my "pack a few hundred very large objects" repo.
-Peff
* Re: [PATCH 2/2] pack-objects: fix threaded load balancing
2007-12-08 5:03 [PATCH 2/2] pack-objects: fix threaded load balancing Nicolas Pitre
2007-12-08 9:18 ` Jeff King
@ 2007-12-10 4:10 ` Jon Smirl
2007-12-10 4:30 ` Jon Smirl
2007-12-11 17:02 ` [PATCH 2/2] pack-objects: fix threaded load balancing Johannes Sixt
3 siblings, 0 replies; 16+ messages in thread
From: Jon Smirl @ 2007-12-10 4:10 UTC (permalink / raw)
To: Nicolas Pitre; +Cc: Junio C Hamano, git
I added a couple of printfs to debug this.
Why did the thread exit when there were still 72,000 objects left?
Another one exited with 50,000 objects left.
The repack is still running; it has been proceeding on two cores for
10 minutes after the two threads exited.
jonsmirl@terra:/video/gcc$ git-repack -a -d -f --depth=250 --window=250
Counting objects: 828348, done.
starting thread 0
Compressing objects: 0% (1/809010)
starting thread 1
starting thread 2
starting thread 3
Compressing objects: 59% (478011/809010)
victim 0x7fffc7976480 sub_size 76058
thread 0x7fffc79764f8 sub_size 76058
Compressing objects: 62% (504273/809010)
victim 0x7fffc79764a8 sub_size 69967
thread 0x7fffc79764d0 sub_size 69967
Compressing objects: 82% (664231/809010)
victim 0x7fffc7976480 sub_size 35308
thread 0x7fffc79764d0 sub_size 35308
Compressing objects: 91% (736690/809010)
victim 0x7fffc79764d0 sub_size 0
thread 0x7fffc79764f8 sub_size 0
Compressing objects: 93% (758922/809010)
victim 0x7fffc79764d0 sub_size 0
thread 0x7fffc79764a8 sub_size 0
--
Jon Smirl
jonsmirl@gmail.com
* Re: [PATCH 2/2] pack-objects: fix threaded load balancing
2007-12-08 5:03 [PATCH 2/2] pack-objects: fix threaded load balancing Nicolas Pitre
2007-12-08 9:18 ` Jeff King
2007-12-10 4:10 ` Jon Smirl
@ 2007-12-10 4:30 ` Jon Smirl
2007-12-10 5:23 ` Jon Smirl
2007-12-11 17:02 ` [PATCH 2/2] pack-objects: fix threaded load balancing Johannes Sixt
3 siblings, 1 reply; 16+ messages in thread
From: Jon Smirl @ 2007-12-10 4:30 UTC (permalink / raw)
To: Nicolas Pitre; +Cc: Junio C Hamano, git
On 12/8/07, Nicolas Pitre <nico@cam.org> wrote:
>
> The current method consists of a master thread serving chunks of objects
> to work threads when they're done with their previous chunk. The issue
> is to determine the best chunk size: making it too large creates poor
> load balancing, while making it too small has a negative effect on pack
> size because of the increased number of chunk boundaries and poor delta
> window utilization.
>
> This patch implements a completely different approach by initially
> splitting the work into large chunks uniformly amongst all threads;
> whenever a thread is done, it steals half of the remaining work from
> the thread with the largest amount of unprocessed objects.
>
> This has the advantage of greatly reducing the number of chunk boundaries
> while achieving almost perfect load balancing.
>
> Signed-off-by: Nicolas Pitre <nico@cam.org>
> ---
> builtin-pack-objects.c | 117 +++++++++++++++++++++++++++++++++++-------------
> 1 files changed, 85 insertions(+), 32 deletions(-)
>
> diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c
> index 5002cc6..fcc1901 100644
> --- a/builtin-pack-objects.c
> +++ b/builtin-pack-objects.c
> @@ -1479,10 +1479,10 @@ static unsigned long free_unpacked(struct unpacked *n)
> return freed_mem;
> }
>
> -static void find_deltas(struct object_entry **list, unsigned list_size,
> +static void find_deltas(struct object_entry **list, unsigned *list_size,
> int window, int depth, unsigned *processed)
> {
> - uint32_t i = 0, idx = 0, count = 0;
> + uint32_t i, idx = 0, count = 0;
> unsigned int array_size = window * sizeof(struct unpacked);
> struct unpacked *array;
> unsigned long mem_usage = 0;
> @@ -1490,11 +1490,23 @@ static void find_deltas(struct object_entry **list, unsigned list_size,
> array = xmalloc(array_size);
> memset(array, 0, array_size);
>
> - do {
> - struct object_entry *entry = list[i++];
> + for (;;) {
> + struct object_entry *entry = *list++;
> struct unpacked *n = array + idx;
> int j, max_depth, best_base = -1;
>
> + progress_lock();
> + if (!*list_size) {
> + progress_unlock();
> + break;
> + }
> + (*list_size)--;
> + if (!entry->preferred_base) {
> + (*processed)++;
> + display_progress(progress_state, *processed);
> + }
> + progress_unlock();
> +
> mem_usage -= free_unpacked(n);
> n->entry = entry;
>
> @@ -1512,11 +1524,6 @@ static void find_deltas(struct object_entry **list, unsigned list_size,
> if (entry->preferred_base)
> goto next;
>
> - progress_lock();
> - (*processed)++;
> - display_progress(progress_state, *processed);
> - progress_unlock();
> -
> /*
> * If the current object is at pack edge, take the depth the
> * objects that depend on the current object into account
> @@ -1576,7 +1583,7 @@ static void find_deltas(struct object_entry **list, unsigned list_size,
> count++;
> if (idx >= window)
> idx = 0;
> - } while (i < list_size);
> + }
>
> for (i = 0; i < window; ++i) {
> free_delta_index(array[i].index);
> @@ -1591,6 +1598,7 @@ struct thread_params {
> pthread_t thread;
> struct object_entry **list;
> unsigned list_size;
> + unsigned remaining;
> int window;
> int depth;
> unsigned *processed;
> @@ -1612,10 +1620,10 @@ static void *threaded_find_deltas(void *arg)
> pthread_mutex_lock(&data_ready);
> pthread_mutex_unlock(&data_request);
>
> - if (!me->list_size)
> + if (!me->remaining)
> return NULL;
>
> - find_deltas(me->list, me->list_size,
> + find_deltas(me->list, &me->remaining,
> me->window, me->depth, me->processed);
> }
> }
> @@ -1624,57 +1632,102 @@ static void ll_find_deltas(struct object_entry **list, unsigned list_size,
> int window, int depth, unsigned *processed)
> {
> struct thread_params *target, p[delta_search_threads];
> - int i, ret;
> - unsigned chunk_size;
> + int i, ret, active_threads = 0;
>
> if (delta_search_threads <= 1) {
> - find_deltas(list, list_size, window, depth, processed);
> + find_deltas(list, &list_size, window, depth, processed);
> return;
> }
>
> pthread_mutex_lock(&data_provider);
> pthread_mutex_lock(&data_ready);
>
> + /* Start work threads. */
> for (i = 0; i < delta_search_threads; i++) {
> p[i].window = window;
> p[i].depth = depth;
> p[i].processed = processed;
> + p[i].remaining = 0;
> ret = pthread_create(&p[i].thread, NULL,
> threaded_find_deltas, &p[i]);
> if (ret)
> die("unable to create thread: %s", strerror(ret));
> + active_threads++;
> }
>
> - /* this should be auto-tuned somehow */
> - chunk_size = window * 1000;
> + /* Then partition the work amongst them. */
> + for (i = 0; i < delta_search_threads; i++) {
> + unsigned sub_size = list_size / (delta_search_threads - i);
>
> - do {
> - unsigned sublist_size = chunk_size;
> - if (sublist_size > list_size)
> - sublist_size = list_size;
> + pthread_mutex_lock(&data_provider);
> + target = data_requester;
> + if (!sub_size) {
> + pthread_mutex_unlock(&data_ready);
> + pthread_join(target->thread, NULL);
> + active_threads--;
> + continue;
> + }
>
> /* try to split chunks on "path" boundaries */
> - while (sublist_size < list_size && list[sublist_size]->hash &&
> - list[sublist_size]->hash == list[sublist_size-1]->hash)
> - sublist_size++;
> + while (sub_size < list_size && list[sub_size]->hash &&
> + list[sub_size]->hash == list[sub_size-1]->hash)
> + sub_size++;
> +
> + target->list = list;
> + target->list_size = sub_size;
> + target->remaining = sub_size;
> + pthread_mutex_unlock(&data_ready);
>
> + list += sub_size;
> + list_size -= sub_size;
> + }
> +
> + /*
> + * Now let's wait for work completion. Each time a thread is done
> + * with its work, we steal half of the remaining work from the
> + * thread with the largest number of unprocessed objects and give
> + * it to that newly idle thread. This ensure good load balancing
> + * until the remaining object list segments are simply too short
> + * to be worth splitting anymore.
> + */
> + do {
> + struct thread_params *victim = NULL;
> + unsigned sub_size = 0;
> pthread_mutex_lock(&data_provider);
> target = data_requester;
> - target->list = list;
> - target->list_size = sublist_size;
> +
> + progress_lock();
> + for (i = 0; i < delta_search_threads; i++)
> + if (p[i].remaining > 2*window &&
> + (!victim || victim->remaining < p[i].remaining))
> + victim = &p[i];
> + if (victim) {
> + sub_size = victim->remaining / 2;
> + list = victim->list + victim->list_size - sub_size;
> + while (sub_size && list[0]->hash &&
> + list[0]->hash == list[-1]->hash) {
> + list++;
I think you needed to copy sub_size to another variable for this loop
> + sub_size--;
> + }
> + target->list = list;
> + victim->list_size -= sub_size;
> + victim->remaining -= sub_size;
> + }
> + progress_unlock();
> +
> + target->list_size = sub_size;
> + target->remaining = sub_size;
> pthread_mutex_unlock(&data_ready);
>
> - list += sublist_size;
> - list_size -= sublist_size;
> - if (!sublist_size) {
> + if (!sub_size) {
> pthread_join(target->thread, NULL);
> - i--;
> + active_threads--;
> }
> - } while (i);
> + } while (active_threads);
> }
>
> #else
> -#define ll_find_deltas find_deltas
> +#define ll_find_deltas(l, s, w, d, p) find_deltas(l, &s, w, d, p)
> #endif
>
> static void prepare_pack(int window, int depth)
> --
> 1.5.3.7.2184.ge321d-dirty
>
>
--
Jon Smirl
jonsmirl@gmail.com
* Re: [PATCH 2/2] pack-objects: fix threaded load balancing
2007-12-10 4:30 ` Jon Smirl
@ 2007-12-10 5:23 ` Jon Smirl
2007-12-10 5:59 ` Jon Smirl
0 siblings, 1 reply; 16+ messages in thread
From: Jon Smirl @ 2007-12-10 5:23 UTC (permalink / raw)
To: Nicolas Pitre; +Cc: Junio C Hamano, git
On 12/9/07, Jon Smirl <jonsmirl@gmail.com> wrote:
> > + if (victim) {
> > + sub_size = victim->remaining / 2;
> > + list = victim->list + victim->list_size - sub_size;
> > + while (sub_size && list[0]->hash &&
> > + list[0]->hash == list[-1]->hash) {
> > + list++;
>
> I think you needed to copy sub_size to another variable for this loop
Copying sub_size was wrong. I believe you are checking for deltas on
the same file. It's probably that chain of 103,817 deltas that can't
be broken up.
ChainLength Objects Cumulative
1: 103817 103817
2: 67332 171149
3: 57520 228669
4: 52570 281239
5: 43910 325149
6: 37520 362669
7: 35248 397917
8: 29819 427736
9: 27619 455355
10: 22656 478011
11: 21073 499084
...
>
> > + sub_size--;
> > + }
> > + target->list = list;
> > + victim->list_size -= sub_size;
> > + victim->remaining -= sub_size;
> > + }
--
Jon Smirl
jonsmirl@gmail.com
* Re: [PATCH 2/2] pack-objects: fix threaded load balancing
2007-12-10 5:23 ` Jon Smirl
@ 2007-12-10 5:59 ` Jon Smirl
2007-12-10 6:06 ` Jon Smirl
0 siblings, 1 reply; 16+ messages in thread
From: Jon Smirl @ 2007-12-10 5:59 UTC (permalink / raw)
To: Nicolas Pitre; +Cc: Junio C Hamano, git
I just deleted the section looking for identical hashes.
+ while (sub_size && list[0]->hash &&
+ list[0]->hash == list[-1]->hash) {
+ list++;
+ sub_size--;
+ }
Doing that allows the long chains to be split over the cores.
My last 5% of objects is taking over 50% of the total CPU time in the
repack. I think these objects are the ones from that 103,817 entry
chain. It is also causing the explosion in RAM consumption.
At the end I can only do 20 objects per clock second on four cores. It
takes 30 clock minutes (120 CPU minutes) to do the last 3% of objects.
Can the chains be limited to not grow over some reasonable number, say
5,000? It will make the pack a little bigger but it will help a lot
with performance.
--
Jon Smirl
jonsmirl@gmail.com
* Re: [PATCH 2/2] pack-objects: fix threaded load balancing
2007-12-10 5:59 ` Jon Smirl
@ 2007-12-10 6:06 ` Jon Smirl
2007-12-10 6:19 ` Jon Smirl
2007-12-10 16:14 ` Nicolas Pitre
0 siblings, 2 replies; 16+ messages in thread
From: Jon Smirl @ 2007-12-10 6:06 UTC (permalink / raw)
To: Nicolas Pitre; +Cc: Junio C Hamano, git
On 12/10/07, Jon Smirl <jonsmirl@gmail.com> wrote:
> I just deleted the section looking for identical hashes.
>
> + while (sub_size && list[0]->hash &&
> + list[0]->hash == list[-1]->hash) {
> + list++;
> + sub_size--;
> + }
>
> Doing that allows the long chains to be split over the cores.
>
> My last 5% of objects is taking over 50% of the total CPU time in the
> repack. I think these objects are the ones from that 103,817 entry
> chain. It is also causing the explosion in RAM consumption.
>
> At the end I can only do 20 objects per clock second on four cores. It
> takes 30 clock minutes (120 CPU minutes) to do the last 3% of objects.
It's all in create_delta...
samples % symbol name
10344074 98.5961 create_delta
138010 1.3155 create_delta_index
4380 0.0417 find_deltas
2526 0.0241 patch_delta
776 0.0074 unpack_entry
>
> Can the chains be limited to not grow over some reasonable number, say
> 5,000? It will make the pack a little bigger but it will help a lot
> with performance.
>
> --
> Jon Smirl
> jonsmirl@gmail.com
>
--
Jon Smirl
jonsmirl@gmail.com
* Re: [PATCH 2/2] pack-objects: fix threaded load balancing
2007-12-10 6:06 ` Jon Smirl
@ 2007-12-10 6:19 ` Jon Smirl
2007-12-10 16:03 ` Nicolas Pitre
2007-12-10 16:14 ` Nicolas Pitre
1 sibling, 1 reply; 16+ messages in thread
From: Jon Smirl @ 2007-12-10 6:19 UTC (permalink / raw)
To: Nicolas Pitre; +Cc: Junio C Hamano, git
It seems to be caused by looping over those runs of 100,000 identical
hash entries.
 5091699 24.6407 :      for (entry = index->hash[i]; entry < index->hash[i+1]; entry++) {
 5301791 25.6575 :              const unsigned char *ref = entry->ptr;

  228772  1.1071 :      while (data < top) {
  223646  1.0823 :              if (msize < 4096) {
                 :                      struct index_entry *entry;
    5862  0.0284 :                      val ^= U[data[-RABIN_WINDOW]];
  753004  3.6441 :                      val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
 1232556  5.9648 :                      i = val & index->hash_mask;
 5091699 24.6407 :                      for (entry = index->hash[i]; entry < index->hash[i+1]; entry++) {
 5301791 25.6575 :                              const unsigned char *ref = entry->ptr;
                 :                              const unsigned char *src = data;
   25919  0.1254 :                              unsigned int ref_size = ref_top - ref;
  740077  3.5815 :                              if (entry->val != val)
                 :                                      continue;
     331  0.0016 :                              if (ref_size > top - src)
   83804  0.4056 :                                      ref_size = top - src;
   25059  0.1213 :                              if (ref_size <= msize)
                 :                                      break;
 1269621  6.1442 :                              while (ref_size-- && *src++ == *ref)
   42122  0.2038 :                                      ref++;
   14362  0.0695 :                              if (msize < ref - entry->ptr) {
                 :                                      /* this is our best match so far */
   10452  0.0506 :                                      msize = ref - entry->ptr;
    6882  0.0333 :                                      moff = entry->ptr - ref_data;
    6382  0.0309 :                                      if (msize >= 4096) /* good enough */
                 :                                              break;
                 :                              }
                 :                      }
--
Jon Smirl
jonsmirl@gmail.com
* Re: [PATCH 2/2] pack-objects: fix threaded load balancing
2007-12-10 6:19 ` Jon Smirl
@ 2007-12-10 16:03 ` Nicolas Pitre
0 siblings, 0 replies; 16+ messages in thread
From: Nicolas Pitre @ 2007-12-10 16:03 UTC (permalink / raw)
To: Jon Smirl; +Cc: Junio C Hamano, git
On Mon, 10 Dec 2007, Jon Smirl wrote:
> It seems to be caused by looping over those runs of 100,000 identical
> hash entries.
As mentioned earlier, there simply can't be 100000 identical hash
entries.
> 5091699 24.6407 : for (entry = index->hash[i]; entry < index->hash[i+1]; entry++) {
> 5301791 25.6575 : const unsigned char *ref = entry->ptr;
This loop is the very heart of the delta search code. It is normal to
see it high on the graph. In the worst cases, this loop might even be
entered for every byte in the source buffer.
And yet I bring you back to your observation that the source pack used
for the repack (whether it is the 2.1GB pack or the 300MB pack) has a
great influence on the performance outcome. The code above, though,
would be executed exactly the same number of times, with exactly the
same amount of memory allocated, regardless of the source pack used (as
long as you use -f with 'git repack', of course).
So this cannot be the culprit.
Nicolas
* Re: [PATCH 2/2] pack-objects: fix threaded load balancing
2007-12-10 6:06 ` Jon Smirl
2007-12-10 6:19 ` Jon Smirl
@ 2007-12-10 16:14 ` Nicolas Pitre
2007-12-10 17:06 ` Jon Smirl
1 sibling, 1 reply; 16+ messages in thread
From: Nicolas Pitre @ 2007-12-10 16:14 UTC (permalink / raw)
To: Jon Smirl; +Cc: Junio C Hamano, git
On Mon, 10 Dec 2007, Jon Smirl wrote:
> On 12/10/07, Jon Smirl <jonsmirl@gmail.com> wrote:
> > I just deleted the section looking for identical hashes.
> >
> > + while (sub_size && list[0]->hash &&
> > + list[0]->hash == list[-1]->hash) {
> > + list++;
> > + sub_size--;
> > + }
> >
> > Doing that allows the long chains to be split over the cores.
> >
> > My last 5% of objects is taking over 50% of the total CPU time in the
> > repack. I think these objects are the ones from that 103,817 entry
> > chain. It is also causing the explosion in RAM consumption.
> >
> > At the end I can only do 20 objects per clock second on four cores. It
> > takes 30 clock minutes (120 CPU minutes) to do the last 3% of objects.
>
> It's all in create_delta...
Here you're mixing two different hashes with no relation whatsoever to
each other.
The hash in create_delta corresponds to chunks of data in a reference
buffer that we try to match in a source buffer.
The hash in the code above has to do with the file names the
corresponding objects are coming from.
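For illustration only, the path hash referred to here is a tiny hash over
the file name, roughly of the following shape (a paraphrase, not the
verbatim git function); every object recorded under a frequently touched
path such as gcc/ChangeLog therefore ends up with the same hash value and
sorts next to its neighbours:

#include <ctype.h>

/*
 * Rough sketch of a pack-objects style name hash: whitespace is skipped
 * and the last characters of the path dominate the result, so objects
 * from the same path (or paths with the same ending) share one value.
 */
static unsigned name_hash(const char *name)
{
	unsigned hash = 0;
	unsigned char c;

	if (!name)
		return 0;
	while ((c = *name++) != 0) {
		if (isspace(c))
			continue;
		hash = (hash >> 2) + ((unsigned)c << 24);
	}
	return hash;
}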
And again, both hash uses are deterministic, i.e. they will be the same
when repacking with -f regardless of whether the source pack is the 2.1GB
or the 300MB one, so they cannot explain the huge performance and memory
usage discrepancy you see between those two packs.
The code that does get influenced by the source pack, though, is all
concentrated in sha1_file.c.
Nicolas
* Re: [PATCH 2/2] pack-objects: fix threaded load balancing
2007-12-10 16:14 ` Nicolas Pitre
@ 2007-12-10 17:06 ` Jon Smirl
2007-12-10 18:21 ` Nicolas Pitre
0 siblings, 1 reply; 16+ messages in thread
From: Jon Smirl @ 2007-12-10 17:06 UTC (permalink / raw)
To: Nicolas Pitre; +Cc: Junio C Hamano, git
On 12/10/07, Nicolas Pitre <nico@cam.org> wrote:
> On Mon, 10 Dec 2007, Jon Smirl wrote:
>
> > On 12/10/07, Jon Smirl <jonsmirl@gmail.com> wrote:
> > > I just deleted the section looking for identical hashes.
> > >
> > > + while (sub_size && list[0]->hash &&
> > > + list[0]->hash == list[-1]->hash) {
> > > + list++;
> > > + sub_size--;
> > > + }
> > >
> > > Doing that allows the long chains to be split over the cores.
> > >
> > > My last 5% of objects is taking over 50% of the total CPU time in the
> > > repack. I think these objects are the ones from that 103,817 entry
> > > chain. It is also causing the explosion in RAM consumption.
> > >
> > > At the end I can only do 20 objects per clock second on four cores. It
> > > takes 30 clock minutes (120 CPU minutes) to do the last 3% of objects.
> >
> > It's all in create_delta...
>
> Here you're mixing two different hashes with no relation whatsoever to
> each other.
>
> The hash in create_delta corresponds to chunks of data in a reference
> buffer that we try to match in a source buffer.
>
> The hash in the code above has to do with the file names the
> corresponding objects are coming from.
So can we change this loop to exit after a max of window_size * 10 or
something like that iterations? Without capping it the threads become
way unbalanced in the end. In the gcc case one thread is continuing
30+ minutes past the others exiting.
> And again, both hash uses are deterministic, i.e. they will be the same
> when repacking with -f regardless of whether the source pack is the 2.1GB
> or the 300MB one, so they cannot explain the huge performance and memory
> usage discrepancy you see between those two packs.
There is a correlation here but I don't know what it is. The memory
blow up occurs near the end of repacking. At the same time I move from
processing hundreds of objects per second to just a few per second.
And the threads are getting unbalanced.
I did notice that when I removed the above loop and evened things out
memory consumption did not spike as bad as it previously did. I maxed
out at 3GB instead of 4.5GB.
Linus suggested memory fragmentation could be a culprit. Evening the
threads out changed the allocation pattern. It is possible that it
avoided a fragmentation problem. It is also possible that evening
things out split the work so that less memory needed to be allocated.
Don't take any of these numbers as gospel. I am using the machine
for other things while I run these tests, and there may be
interactions.
>
> The code that do get influenced by the source pack, though, is all
> concentrated in sha1_file.c.
>
>
> Nicolas
>
--
Jon Smirl
jonsmirl@gmail.com
* Re: [PATCH 2/2] pack-objects: fix threaded load balancing
2007-12-10 17:06 ` Jon Smirl
@ 2007-12-10 18:21 ` Nicolas Pitre
2007-12-10 19:19 ` [PATCH] pack-objects: more threaded load balancing fix with often changed paths Nicolas Pitre
0 siblings, 1 reply; 16+ messages in thread
From: Nicolas Pitre @ 2007-12-10 18:21 UTC (permalink / raw)
To: Jon Smirl; +Cc: Junio C Hamano, git
On Mon, 10 Dec 2007, Jon Smirl wrote:
> On 12/10/07, Nicolas Pitre <nico@cam.org> wrote:
>
> > The hash in the code above has to do with the file names the
> > corresponding objects are coming from.
>
> So can we change this loop to exit after a max of window_size * 10 or
> something like that iterations? Without capping it the threads become
> way unbalanced in the end. In the gcc case one thread is continuing
> 30+ minutes past the others exiting.
Indeed, some more tweaking is needed.
The object path distribution goes like this for the gcc repo:
105557 gcc
42537 gcc/ChangeLog
25210 gcc/config
20690 gcc/testsuite
13434 gcc/testsuite/ChangeLog
12363 libstdc++-v3
9346 gcc/cp
8757 libstdc++-v3/include
8186 gcc/version.c
7867 gcc/cp/ChangeLog
7737 libstdc++-v3/include/bits
7653 libjava
6577 gcc/testsuite/gcc.dg
5942 libjava/ChangeLog
5351 gcc/config/i386
5260 gcc/testsuite/g++.dg
4451 gcc/f
4330 libstdc++-v3/ChangeLog
4321 libstdc++-v3/include/bits/c++config
4316 gcc/doc
[...]
So... the top entries are most certainly going to create load balancing
issues if their path hash clustering isn't broken.
Nicolas
* [PATCH] pack-objects: more threaded load balancing fix with often changed paths
2007-12-10 18:21 ` Nicolas Pitre
@ 2007-12-10 19:19 ` Nicolas Pitre
0 siblings, 0 replies; 16+ messages in thread
From: Nicolas Pitre @ 2007-12-10 19:19 UTC (permalink / raw)
To: Junio C Hamano, Jon Smirl; +Cc: git
The code that splits the object list amongst work threads tries to do so
on "path" boundaries so as not to prevent good delta matches. However, in
some cases a few paths may largely dominate the hash distribution, and
it is not possible to have good load balancing without ignoring those
boundaries.
Signed-off-by: Nicolas Pitre <nico@cam.org>
---
On Mon, 10 Dec 2007, Nicolas Pitre wrote:
> On Mon, 10 Dec 2007, Jon Smirl wrote:
>
> > On 12/10/07, Nicolas Pitre <nico@cam.org> wrote:
> >
> > > The hash in the code above has to do with the file names the
> > > corresponding objects are coming from.
> >
> > So can we change this loop to exit after a max of window_size * 10 or
> > something like that iterations? Without capping it the threads become
> > way unbalanced in the end. In the gcc case one thread is continuing
> > 30+ minutes past the others exiting.
>
> Indeed, some more tweaking is needed.
>
> The object path distribution goes like this for the gcc repo:
>
> 105557 gcc
> 42537 gcc/ChangeLog
> 25210 gcc/config
> 20690 gcc/testsuite
> 13434 gcc/testsuite/ChangeLog
> 12363 libstdc++-v3
> 9346 gcc/cp
> 8757 libstdc++-v3/include
> 8186 gcc/version.c
> 7867 gcc/cp/ChangeLog
> 7737 libstdc++-v3/include/bits
> 7653 libjava
> 6577 gcc/testsuite/gcc.dg
> 5942 libjava/ChangeLog
> 5351 gcc/config/i386
> 5260 gcc/testsuite/g++.dg
> 4451 gcc/f
> 4330 libstdc++-v3/ChangeLog
> 4321 libstdc++-v3/include/bits/c++config
> 4316 gcc/doc
> [...]
>
> So... the top entries are most certainly going to create load balancing
> issues if their path hash clustering isn't broken.
Here's the fix.
diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c
index 250dc56..7dd0d7f 100644
--- a/builtin-pack-objects.c
+++ b/builtin-pack-objects.c
@@ -1709,6 +1709,16 @@ static void ll_find_deltas(struct object_entry **list, unsigned list_size,
list++;
sub_size--;
}
+ if (!sub_size) {
+ /*
+ * It is possible for some "paths" to have
+ * so many objects that no hash boundary
+ * might be found. Let's just steal the
+ * exact half in that case.
+ */
+ sub_size = victim->remaining / 2;
+ list -= sub_size;
+ }
target->list = list;
victim->list_size -= sub_size;
victim->remaining -= sub_size;
* Re: [PATCH 2/2] pack-objects: fix threaded load balancing
2007-12-08 5:03 [PATCH 2/2] pack-objects: fix threaded load balancing Nicolas Pitre
` (2 preceding siblings ...)
2007-12-10 4:30 ` Jon Smirl
@ 2007-12-11 17:02 ` Johannes Sixt
2007-12-11 17:28 ` Nicolas Pitre
3 siblings, 1 reply; 16+ messages in thread
From: Johannes Sixt @ 2007-12-11 17:02 UTC (permalink / raw)
To: Nicolas Pitre; +Cc: Junio C Hamano, git, Jon Smirl
Nicolas Pitre schrieb:
> @@ -1612,10 +1620,10 @@ static void *threaded_find_deltas(void *arg)
> pthread_mutex_lock(&data_ready);
> pthread_mutex_unlock(&data_request);
>
> - if (!me->list_size)
> + if (!me->remaining)
> return NULL;
>
> - find_deltas(me->list, me->list_size,
> + find_deltas(me->list, &me->remaining,
> me->window, me->depth, me->processed);
> }
> }
This hunk caught my attention. &data_ready is locked, but not released in
this function.
Looking more closely at the code surrounding this hunk, it seems that the
lock is released by a *different* thread than the one that locked it. This
works on Linux, but is not portable. We will have to use condition variables
like everyone else does in a producer-consumer-like scenario.
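A minimal sketch of that producer/consumer handshake with a condition
variable (hypothetical names, not the actual pack-objects code; every
mutex is then locked and unlocked by the same thread):

#include <pthread.h>

struct work_slot {
	pthread_mutex_t mutex;	/* init with PTHREAD_MUTEX_INITIALIZER */
	pthread_cond_t cond;	/* init with PTHREAD_COND_INITIALIZER */
	int data_ready;		/* predicate guarded by mutex */
	void *payload;
};

/* consumer (work thread) */
static void *wait_for_work(struct work_slot *slot)
{
	void *payload;

	pthread_mutex_lock(&slot->mutex);
	while (!slot->data_ready)	/* loop: guards against spurious wakeups */
		pthread_cond_wait(&slot->cond, &slot->mutex);
	slot->data_ready = 0;
	payload = slot->payload;
	pthread_mutex_unlock(&slot->mutex);
	return payload;
}

/* producer (main thread) */
static void hand_out_work(struct work_slot *slot, void *payload)
{
	pthread_mutex_lock(&slot->mutex);
	slot->payload = payload;
	slot->data_ready = 1;
	pthread_cond_signal(&slot->cond);
	pthread_mutex_unlock(&slot->mutex);
}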
-- Hannes
* Re: [PATCH 2/2] pack-objects: fix threaded load balancing
2007-12-11 17:02 ` [PATCH 2/2] pack-objects: fix threaded load balancing Johannes Sixt
@ 2007-12-11 17:28 ` Nicolas Pitre
2007-12-13 7:15 ` Johannes Sixt
0 siblings, 1 reply; 16+ messages in thread
From: Nicolas Pitre @ 2007-12-11 17:28 UTC (permalink / raw)
To: Johannes Sixt; +Cc: Junio C Hamano, git, Jon Smirl
On Tue, 11 Dec 2007, Johannes Sixt wrote:
> Nicolas Pitre schrieb:
> > @@ -1612,10 +1620,10 @@ static void *threaded_find_deltas(void *arg)
> > pthread_mutex_lock(&data_ready);
> > pthread_mutex_unlock(&data_request);
> >
> > - if (!me->list_size)
> > + if (!me->remaining)
> > return NULL;
> >
> > - find_deltas(me->list, me->list_size,
> > + find_deltas(me->list, &me->remaining,
> > me->window, me->depth, me->processed);
> > }
> > }
>
> This hunk caught my attention. &data_ready is locked, but not released in
> this function.
>
> Looking more closely at the code surrounding this hunk, it seems that the
> lock is released by a *different* thread than the one that locked it. This
> works on Linux, but is not portable. We will have to use condition variables
> like everyone else does in a producer-consumer-like scenario.
Are you willing to make a patch for it?
Nicolas
* Re: [PATCH 2/2] pack-objects: fix threaded load balancing
2007-12-11 17:28 ` Nicolas Pitre
@ 2007-12-13 7:15 ` Johannes Sixt
0 siblings, 0 replies; 16+ messages in thread
From: Johannes Sixt @ 2007-12-13 7:15 UTC (permalink / raw)
To: Nicolas Pitre; +Cc: Junio C Hamano, git, Jon Smirl
Nicolas Pitre schrieb:
> On Tue, 11 Dec 2007, Johannes Sixt wrote:
>
>> Nicolas Pitre schrieb:
>>> @@ -1612,10 +1620,10 @@ static void *threaded_find_deltas(void *arg)
>>> pthread_mutex_lock(&data_ready);
>>> pthread_mutex_unlock(&data_request);
>>>
>>> - if (!me->list_size)
>>> + if (!me->remaining)
>>> return NULL;
>>>
>>> - find_deltas(me->list, me->list_size,
>>> + find_deltas(me->list, &me->remaining,
>>> me->window, me->depth, me->processed);
>>> }
>>> }
>> This hunk caught my attention. &data_ready is locked, but not released in
>> this function.
>>
>> Looking more closely at the code surrounding this hunk, it seems that the
>> lock is released by a *different* thread than the one that locked it. This
>> works on Linux, but is not portable. We will have to use condition variables
>> like everyone else does in a producer-consumer-like scenario.
>
> Are you willing to make a patch for it?
I'm working on it, slowly.
-- Hannes