* [PATCH 2/2] pack-objects: fix threaded load balancing
@ 2007-12-08 5:03 Nicolas Pitre
2007-12-08 9:18 ` Jeff King
` (3 more replies)
0 siblings, 4 replies; 16+ messages in thread
From: Nicolas Pitre @ 2007-12-08 5:03 UTC (permalink / raw)
To: Junio C Hamano; +Cc: git, Jon Smirl
The current method consists of a master thread serving chunks of objects
to work threads when they're done with their previous chunk. The issue
is to determine the best chunk size: making it too large creates poor
load balancing, while making it too small has a negative effect on pack
size because of the increased number of chunk boundaries and poor delta
window utilization.
This patch implements a completely different approach by initially
splitting the work into large chunks uniformly amongst all threads;
whenever a thread is done, it steals half of the remaining work from
the thread with the largest amount of unprocessed objects.
This has the advantage of greatly reducing the number of chunk boundaries
while achieving almost perfect load balancing.
Signed-off-by: Nicolas Pitre <nico@cam.org>
---
builtin-pack-objects.c | 117 +++++++++++++++++++++++++++++++++++-------------
1 files changed, 85 insertions(+), 32 deletions(-)
diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c
index 5002cc6..fcc1901 100644
--- a/builtin-pack-objects.c
+++ b/builtin-pack-objects.c
@@ -1479,10 +1479,10 @@ static unsigned long free_unpacked(struct unpacked *n)
return freed_mem;
}
-static void find_deltas(struct object_entry **list, unsigned list_size,
+static void find_deltas(struct object_entry **list, unsigned *list_size,
int window, int depth, unsigned *processed)
{
- uint32_t i = 0, idx = 0, count = 0;
+ uint32_t i, idx = 0, count = 0;
unsigned int array_size = window * sizeof(struct unpacked);
struct unpacked *array;
unsigned long mem_usage = 0;
@@ -1490,11 +1490,23 @@ static void find_deltas(struct object_entry **list, unsigned list_size,
array = xmalloc(array_size);
memset(array, 0, array_size);
- do {
- struct object_entry *entry = list[i++];
+ for (;;) {
+ struct object_entry *entry = *list++;
struct unpacked *n = array + idx;
int j, max_depth, best_base = -1;
+ progress_lock();
+ if (!*list_size) {
+ progress_unlock();
+ break;
+ }
+ (*list_size)--;
+ if (!entry->preferred_base) {
+ (*processed)++;
+ display_progress(progress_state, *processed);
+ }
+ progress_unlock();
+
mem_usage -= free_unpacked(n);
n->entry = entry;
@@ -1512,11 +1524,6 @@ static void find_deltas(struct object_entry **list, unsigned list_size,
if (entry->preferred_base)
goto next;
- progress_lock();
- (*processed)++;
- display_progress(progress_state, *processed);
- progress_unlock();
-
/*
* If the current object is at pack edge, take the depth the
* objects that depend on the current object into account
@@ -1576,7 +1583,7 @@ static void find_deltas(struct object_entry **list, unsigned list_size,
count++;
if (idx >= window)
idx = 0;
- } while (i < list_size);
+ }
for (i = 0; i < window; ++i) {
free_delta_index(array[i].index);
@@ -1591,6 +1598,7 @@ struct thread_params {
pthread_t thread;
struct object_entry **list;
unsigned list_size;
+ unsigned remaining;
int window;
int depth;
unsigned *processed;
@@ -1612,10 +1620,10 @@ static void *threaded_find_deltas(void *arg)
pthread_mutex_lock(&data_ready);
pthread_mutex_unlock(&data_request);
- if (!me->list_size)
+ if (!me->remaining)
return NULL;
- find_deltas(me->list, me->list_size,
+ find_deltas(me->list, &me->remaining,
me->window, me->depth, me->processed);
}
}
@@ -1624,57 +1632,102 @@ static void ll_find_deltas(struct object_entry **list, unsigned list_size,
int window, int depth, unsigned *processed)
{
struct thread_params *target, p[delta_search_threads];
- int i, ret;
- unsigned chunk_size;
+ int i, ret, active_threads = 0;
if (delta_search_threads <= 1) {
- find_deltas(list, list_size, window, depth, processed);
+ find_deltas(list, &list_size, window, depth, processed);
return;
}
pthread_mutex_lock(&data_provider);
pthread_mutex_lock(&data_ready);
+ /* Start work threads. */
for (i = 0; i < delta_search_threads; i++) {
p[i].window = window;
p[i].depth = depth;
p[i].processed = processed;
+ p[i].remaining = 0;
ret = pthread_create(&p[i].thread, NULL,
threaded_find_deltas, &p[i]);
if (ret)
die("unable to create thread: %s", strerror(ret));
+ active_threads++;
}
- /* this should be auto-tuned somehow */
- chunk_size = window * 1000;
+ /* Then partition the work amongst them. */
+ for (i = 0; i < delta_search_threads; i++) {
+ unsigned sub_size = list_size / (delta_search_threads - i);
- do {
- unsigned sublist_size = chunk_size;
- if (sublist_size > list_size)
- sublist_size = list_size;
+ pthread_mutex_lock(&data_provider);
+ target = data_requester;
+ if (!sub_size) {
+ pthread_mutex_unlock(&data_ready);
+ pthread_join(target->thread, NULL);
+ active_threads--;
+ continue;
+ }
/* try to split chunks on "path" boundaries */
- while (sublist_size < list_size && list[sublist_size]->hash &&
- list[sublist_size]->hash == list[sublist_size-1]->hash)
- sublist_size++;
+ while (sub_size < list_size && list[sub_size]->hash &&
+ list[sub_size]->hash == list[sub_size-1]->hash)
+ sub_size++;
+
+ target->list = list;
+ target->list_size = sub_size;
+ target->remaining = sub_size;
+ pthread_mutex_unlock(&data_ready);
+ list += sub_size;
+ list_size -= sub_size;
+ }
+
+ /*
+ * Now let's wait for work completion. Each time a thread is done
+ * with its work, we steal half of the remaining work from the
+ * thread with the largest number of unprocessed objects and give
+ * it to that newly idle thread. This ensure good load balancing
+ * until the remaining object list segments are simply too short
+ * to be worth splitting anymore.
+ */
+ do {
+ struct thread_params *victim = NULL;
+ unsigned sub_size = 0;
pthread_mutex_lock(&data_provider);
target = data_requester;
- target->list = list;
- target->list_size = sublist_size;
+
+ progress_lock();
+ for (i = 0; i < delta_search_threads; i++)
+ if (p[i].remaining > 2*window &&
+ (!victim || victim->remaining < p[i].remaining))
+ victim = &p[i];
+ if (victim) {
+ sub_size = victim->remaining / 2;
+ list = victim->list + victim->list_size - sub_size;
+ while (sub_size && list[0]->hash &&
+ list[0]->hash == list[-1]->hash) {
+ list++;
+ sub_size--;
+ }
+ target->list = list;
+ victim->list_size -= sub_size;
+ victim->remaining -= sub_size;
+ }
+ progress_unlock();
+
+ target->list_size = sub_size;
+ target->remaining = sub_size;
pthread_mutex_unlock(&data_ready);
- list += sublist_size;
- list_size -= sublist_size;
- if (!sublist_size) {
+ if (!sub_size) {
pthread_join(target->thread, NULL);
- i--;
+ active_threads--;
}
- } while (i);
+ } while (active_threads);
}
#else
-#define ll_find_deltas find_deltas
+#define ll_find_deltas(l, s, w, d, p) find_deltas(l, &s, w, d, p)
#endif
static void prepare_pack(int window, int depth)
--
1.5.3.7.2184.ge321d-dirty
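For readers who want the stealing policy in one place rather than spread
across the hunks above, here is a minimal illustrative sketch using
simplified, hypothetical types (not a drop-in for builtin-pack-objects.c;
the real code additionally holds progress_lock() and prefers to split on
path-hash boundaries):

struct object_entry;

struct worker {
	struct object_entry **list;	/* start of this worker's segment */
	unsigned list_size;		/* objects handed to this worker */
	unsigned remaining;		/* objects it has not processed yet */
};

/*
 * Hand half of the busiest worker's unprocessed objects to 'idle'.
 * Returns 0 when no worker has enough work left to be worth splitting,
 * which is the signal to let the idle thread terminate.
 */
static unsigned steal_work(struct worker *w, int nr_workers,
			   struct worker *idle, int window)
{
	struct worker *victim = NULL;
	unsigned sub_size = 0;
	int i;

	for (i = 0; i < nr_workers; i++)
		if (w[i].remaining > 2 * window &&
		    (!victim || victim->remaining < w[i].remaining))
			victim = &w[i];

	if (victim) {
		/* steal the tail end of the victim's segment */
		sub_size = victim->remaining / 2;
		idle->list = victim->list + victim->list_size - sub_size;
		victim->list_size -= sub_size;
		victim->remaining -= sub_size;
	}
	idle->list_size = sub_size;
	idle->remaining = sub_size;
	return sub_size;
}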
* Re: [PATCH 2/2] pack-objects: fix threaded load balancing
2007-12-08 5:03 [PATCH 2/2] pack-objects: fix threaded load balancing Nicolas Pitre
@ 2007-12-08 9:18 ` Jeff King
2007-12-10 4:10 ` Jon Smirl
` (2 subsequent siblings)
3 siblings, 0 replies; 16+ messages in thread
From: Jeff King @ 2007-12-08 9:18 UTC (permalink / raw)
To: Nicolas Pitre; +Cc: Junio C Hamano, git, Jon Smirl
On Sat, Dec 08, 2007 at 12:03:17AM -0500, Nicolas Pitre wrote:
> This patch implements a completely different approach by initially
> splitting the work into large chunks uniformly amongst all threads;
> whenever a thread is done, it steals half of the remaining work from
> the thread with the largest amount of unprocessed objects.
Yay! This works great on my "pack a few hundred very large objects" repo.
-Peff
* Re: [PATCH 2/2] pack-objects: fix threaded load balancing
2007-12-08 5:03 [PATCH 2/2] pack-objects: fix threaded load balancing Nicolas Pitre
2007-12-08 9:18 ` Jeff King
@ 2007-12-10 4:10 ` Jon Smirl
2007-12-10 4:30 ` Jon Smirl
2007-12-11 17:02 ` [PATCH 2/2] pack-objects: fix threaded load balancing Johannes Sixt
3 siblings, 0 replies; 16+ messages in thread
From: Jon Smirl @ 2007-12-10 4:10 UTC (permalink / raw)
To: Nicolas Pitre; +Cc: Junio C Hamano, git
I added a couple of printfs to debug this.
Why did the thread exit when there were still 72,000 objects left?
Another one exited with 50,000 objects left.
The repack is still running; it has been proceeding on two cores for
10 minutes after the two threads exited.
jonsmirl@terra:/video/gcc$ git-repack -a -d -f --depth=250 --window=250
Counting objects: 828348, done.
starting thread 0
Compressing objects: 0% (1/809010)
starting thread 1
starting thread 2
starting thread 3
Compressing objects: 59% (478011/809010)
victim 0x7fffc7976480 sub_size 76058
thread 0x7fffc79764f8 sub_size 76058
Compressing objects: 62% (504273/809010)
victim 0x7fffc79764a8 sub_size 69967
thread 0x7fffc79764d0 sub_size 69967
Compressing objects: 82% (664231/809010)
victim 0x7fffc7976480 sub_size 35308
thread 0x7fffc79764d0 sub_size 35308
Compressing objects: 91% (736690/809010)
victim 0x7fffc79764d0 sub_size 0
thread 0x7fffc79764f8 sub_size 0
Compressing objects: 93% (758922/809010)
victim 0x7fffc79764d0 sub_size 0
thread 0x7fffc79764a8 sub_size 0
--
Jon Smirl
jonsmirl@gmail.com
* Re: [PATCH 2/2] pack-objects: fix threaded load balancing
2007-12-08 5:03 [PATCH 2/2] pack-objects: fix threaded load balancing Nicolas Pitre
2007-12-08 9:18 ` Jeff King
2007-12-10 4:10 ` Jon Smirl
@ 2007-12-10 4:30 ` Jon Smirl
2007-12-10 5:23 ` Jon Smirl
2007-12-11 17:02 ` [PATCH 2/2] pack-objects: fix threaded load balancing Johannes Sixt
3 siblings, 1 reply; 16+ messages in thread
From: Jon Smirl @ 2007-12-10 4:30 UTC (permalink / raw)
To: Nicolas Pitre; +Cc: Junio C Hamano, git
On 12/8/07, Nicolas Pitre <nico@cam.org> wrote:
>
> The current method consists of a master thread serving chunks of objects
> to work threads when they're done with their previous chunk. The issue
> is to determine the best chunk size: making it too large creates poor
> load balancing, while making it too small has a negative effect on pack
> size because of the increased number of chunk boundaries and poor delta
> window utilization.
>
> This patch implements a completely different approach by initially
> splitting the work into large chunks uniformly amongst all threads;
> whenever a thread is done, it steals half of the remaining work from
> the thread with the largest amount of unprocessed objects.
>
> This has the advantage of greatly reducing the number of chunk boundaries
> while achieving almost perfect load balancing.
>
> Signed-off-by: Nicolas Pitre <nico@cam.org>
> ---
> builtin-pack-objects.c | 117 +++++++++++++++++++++++++++++++++++-------------
> 1 files changed, 85 insertions(+), 32 deletions(-)
>
> diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c
> index 5002cc6..fcc1901 100644
> --- a/builtin-pack-objects.c
> +++ b/builtin-pack-objects.c
> @@ -1479,10 +1479,10 @@ static unsigned long free_unpacked(struct unpacked *n)
> return freed_mem;
> }
>
> -static void find_deltas(struct object_entry **list, unsigned list_size,
> +static void find_deltas(struct object_entry **list, unsigned *list_size,
> int window, int depth, unsigned *processed)
> {
> - uint32_t i = 0, idx = 0, count = 0;
> + uint32_t i, idx = 0, count = 0;
> unsigned int array_size = window * sizeof(struct unpacked);
> struct unpacked *array;
> unsigned long mem_usage = 0;
> @@ -1490,11 +1490,23 @@ static void find_deltas(struct object_entry **list, unsigned list_size,
> array = xmalloc(array_size);
> memset(array, 0, array_size);
>
> - do {
> - struct object_entry *entry = list[i++];
> + for (;;) {
> + struct object_entry *entry = *list++;
> struct unpacked *n = array + idx;
> int j, max_depth, best_base = -1;
>
> + progress_lock();
> + if (!*list_size) {
> + progress_unlock();
> + break;
> + }
> + (*list_size)--;
> + if (!entry->preferred_base) {
> + (*processed)++;
> + display_progress(progress_state, *processed);
> + }
> + progress_unlock();
> +
> mem_usage -= free_unpacked(n);
> n->entry = entry;
>
> @@ -1512,11 +1524,6 @@ static void find_deltas(struct object_entry **list, unsigned list_size,
> if (entry->preferred_base)
> goto next;
>
> - progress_lock();
> - (*processed)++;
> - display_progress(progress_state, *processed);
> - progress_unlock();
> -
> /*
> * If the current object is at pack edge, take the depth the
> * objects that depend on the current object into account
> @@ -1576,7 +1583,7 @@ static void find_deltas(struct object_entry **list, unsigned list_size,
> count++;
> if (idx >= window)
> idx = 0;
> - } while (i < list_size);
> + }
>
> for (i = 0; i < window; ++i) {
> free_delta_index(array[i].index);
> @@ -1591,6 +1598,7 @@ struct thread_params {
> pthread_t thread;
> struct object_entry **list;
> unsigned list_size;
> + unsigned remaining;
> int window;
> int depth;
> unsigned *processed;
> @@ -1612,10 +1620,10 @@ static void *threaded_find_deltas(void *arg)
> pthread_mutex_lock(&data_ready);
> pthread_mutex_unlock(&data_request);
>
> - if (!me->list_size)
> + if (!me->remaining)
> return NULL;
>
> - find_deltas(me->list, me->list_size,
> + find_deltas(me->list, &me->remaining,
> me->window, me->depth, me->processed);
> }
> }
> @@ -1624,57 +1632,102 @@ static void ll_find_deltas(struct object_entry **list, unsigned list_size,
> int window, int depth, unsigned *processed)
> {
> struct thread_params *target, p[delta_search_threads];
> - int i, ret;
> - unsigned chunk_size;
> + int i, ret, active_threads = 0;
>
> if (delta_search_threads <= 1) {
> - find_deltas(list, list_size, window, depth, processed);
> + find_deltas(list, &list_size, window, depth, processed);
> return;
> }
>
> pthread_mutex_lock(&data_provider);
> pthread_mutex_lock(&data_ready);
>
> + /* Start work threads. */
> for (i = 0; i < delta_search_threads; i++) {
> p[i].window = window;
> p[i].depth = depth;
> p[i].processed = processed;
> + p[i].remaining = 0;
> ret = pthread_create(&p[i].thread, NULL,
> threaded_find_deltas, &p[i]);
> if (ret)
> die("unable to create thread: %s", strerror(ret));
> + active_threads++;
> }
>
> - /* this should be auto-tuned somehow */
> - chunk_size = window * 1000;
> + /* Then partition the work amongst them. */
> + for (i = 0; i < delta_search_threads; i++) {
> + unsigned sub_size = list_size / (delta_search_threads - i);
>
> - do {
> - unsigned sublist_size = chunk_size;
> - if (sublist_size > list_size)
> - sublist_size = list_size;
> + pthread_mutex_lock(&data_provider);
> + target = data_requester;
> + if (!sub_size) {
> + pthread_mutex_unlock(&data_ready);
> + pthread_join(target->thread, NULL);
> + active_threads--;
> + continue;
> + }
>
> /* try to split chunks on "path" boundaries */
> - while (sublist_size < list_size && list[sublist_size]->hash &&
> - list[sublist_size]->hash == list[sublist_size-1]->hash)
> - sublist_size++;
> + while (sub_size < list_size && list[sub_size]->hash &&
> + list[sub_size]->hash == list[sub_size-1]->hash)
> + sub_size++;
> +
> + target->list = list;
> + target->list_size = sub_size;
> + target->remaining = sub_size;
> + pthread_mutex_unlock(&data_ready);
>
> + list += sub_size;
> + list_size -= sub_size;
> + }
> +
> + /*
> + * Now let's wait for work completion. Each time a thread is done
> + * with its work, we steal half of the remaining work from the
> + * thread with the largest number of unprocessed objects and give
> + * it to that newly idle thread. This ensure good load balancing
> + * until the remaining object list segments are simply too short
> + * to be worth splitting anymore.
> + */
> + do {
> + struct thread_params *victim = NULL;
> + unsigned sub_size = 0;
> pthread_mutex_lock(&data_provider);
> target = data_requester;
> - target->list = list;
> - target->list_size = sublist_size;
> +
> + progress_lock();
> + for (i = 0; i < delta_search_threads; i++)
> + if (p[i].remaining > 2*window &&
> + (!victim || victim->remaining < p[i].remaining))
> + victim = &p[i];
> + if (victim) {
> + sub_size = victim->remaining / 2;
> + list = victim->list + victim->list_size - sub_size;
> + while (sub_size && list[0]->hash &&
> + list[0]->hash == list[-1]->hash) {
> + list++;
I think you needed to copy sub_size to another variable for this loop
> + sub_size--;
> + }
> + target->list = list;
> + victim->list_size -= sub_size;
> + victim->remaining -= sub_size;
> + }
> + progress_unlock();
> +
> + target->list_size = sub_size;
> + target->remaining = sub_size;
> pthread_mutex_unlock(&data_ready);
>
> - list += sublist_size;
> - list_size -= sublist_size;
> - if (!sublist_size) {
> + if (!sub_size) {
> pthread_join(target->thread, NULL);
> - i--;
> + active_threads--;
> }
> - } while (i);
> + } while (active_threads);
> }
>
> #else
> -#define ll_find_deltas find_deltas
> +#define ll_find_deltas(l, s, w, d, p) find_deltas(l, &s, w, d, p)
> #endif
>
> static void prepare_pack(int window, int depth)
> --
> 1.5.3.7.2184.ge321d-dirty
>
>
--
Jon Smirl
jonsmirl@gmail.com
* Re: [PATCH 2/2] pack-objects: fix threaded load balancing
2007-12-10 4:30 ` Jon Smirl
@ 2007-12-10 5:23 ` Jon Smirl
2007-12-10 5:59 ` Jon Smirl
0 siblings, 1 reply; 16+ messages in thread
From: Jon Smirl @ 2007-12-10 5:23 UTC (permalink / raw)
To: Nicolas Pitre; +Cc: Junio C Hamano, git
On 12/9/07, Jon Smirl <jonsmirl@gmail.com> wrote:
> > + if (victim) {
> > + sub_size = victim->remaining / 2;
> > + list = victim->list + victim->list_size - sub_size;
> > + while (sub_size && list[0]->hash &&
> > + list[0]->hash == list[-1]->hash) {
> > + list++;
>
> I think you needed to copy sub_size to another variable for this loop
Copying sub_size was wrong. I believe you are checking for deltas on
the same file. It's probably that chain of 103,817 deltas that can't
be broken up.
ChainLength Objects Cumulative
1: 103817 103817
2: 67332 171149
3: 57520 228669
4: 52570 281239
5: 43910 325149
6: 37520 362669
7: 35248 397917
8: 29819 427736
9: 27619 455355
10: 22656 478011
11: 21073 499084
...
>
> > + sub_size--;
> > + }
> > + target->list = list;
> > + victim->list_size -= sub_size;
> > + victim->remaining -= sub_size;
> > + }
--
Jon Smirl
jonsmirl@gmail.com
* Re: [PATCH 2/2] pack-objects: fix threaded load balancing
2007-12-10 5:23 ` Jon Smirl
@ 2007-12-10 5:59 ` Jon Smirl
2007-12-10 6:06 ` Jon Smirl
0 siblings, 1 reply; 16+ messages in thread
From: Jon Smirl @ 2007-12-10 5:59 UTC (permalink / raw)
To: Nicolas Pitre; +Cc: Junio C Hamano, git
I just deleted the section looking for identical hashes.
+ while (sub_size && list[0]->hash &&
+ list[0]->hash == list[-1]->hash) {
+ list++;
+ sub_size--;
+ }
Doing that allows the long chains to be split over the cores.
My last 5% of objects is taking over 50% of the total CPU time in the
repack. I think these objects are the ones from that 103,817 entry
chain. It is also causing the explosion in RAM consumption.
At the end I can only do 20 objects per clock second on four cores. It
takes 30 clock minutes (120 CPU minutes) to do the last 3% of objects.
Can the chains be limited to not grow over some reasonable number, say
5,000? It will make the pack a little bigger but it will help a lot
with performance.
--
Jon Smirl
jonsmirl@gmail.com
* Re: [PATCH 2/2] pack-objects: fix threaded load balancing
2007-12-10 5:59 ` Jon Smirl
@ 2007-12-10 6:06 ` Jon Smirl
2007-12-10 6:19 ` Jon Smirl
2007-12-10 16:14 ` Nicolas Pitre
0 siblings, 2 replies; 16+ messages in thread
From: Jon Smirl @ 2007-12-10 6:06 UTC (permalink / raw)
To: Nicolas Pitre; +Cc: Junio C Hamano, git
On 12/10/07, Jon Smirl <jonsmirl@gmail.com> wrote:
> I just deleted the section looking for identical hashes.
>
> + while (sub_size && list[0]->hash &&
> + list[0]->hash == list[-1]->hash) {
> + list++;
> + sub_size--;
> + }
>
> Doing that allows the long chains to be split over the cores.
>
> My last 5% of objects is taking over 50% of the total CPU time in the
> repack. I think these objects are the ones from that 103,817 entry
> chain. It is also causing the explosion in RAM consumption.
>
> At the end I can only do 20 objects per clock second on four cores. It
> takes 30 clock minutes (120 CPU minutes) to do the last 3% of objects.
It's all in create_delta...
samples % symbol name
10344074 98.5961 create_delta
138010 1.3155 create_delta_index
4380 0.0417 find_deltas
2526 0.0241 patch_delta
776 0.0074 unpack_entry
>
> Can the chains be limited to not grow over some reasonable number, say
> 5,000? It will make the pack a little bigger but it will help a lot
> with performance.
>
> --
> Jon Smirl
> jonsmirl@gmail.com
>
--
Jon Smirl
jonsmirl@gmail.com
* Re: [PATCH 2/2] pack-objects: fix threaded load balancing
2007-12-10 6:06 ` Jon Smirl
@ 2007-12-10 6:19 ` Jon Smirl
2007-12-10 16:03 ` Nicolas Pitre
2007-12-10 16:14 ` Nicolas Pitre
1 sibling, 1 reply; 16+ messages in thread
From: Jon Smirl @ 2007-12-10 6:19 UTC (permalink / raw)
To: Nicolas Pitre; +Cc: Junio C Hamano, git
It seems to be caused by looping over those runs of 100,000 identical
hash entries.
 5091699 24.6407 :      for (entry = index->hash[i]; entry < index->hash[i+1]; entry++) {
 5301791 25.6575 :              const unsigned char *ref = entry->ptr;

  228772  1.1071 :      while (data < top) {
  223646  1.0823 :              if (msize < 4096) {
                 :                      struct index_entry *entry;
    5862  0.0284 :                      val ^= U[data[-RABIN_WINDOW]];
  753004  3.6441 :                      val = ((val << 8) | *data) ^ T[val >> RABIN_SHIFT];
 1232556  5.9648 :                      i = val & index->hash_mask;
 5091699 24.6407 :                      for (entry = index->hash[i]; entry < index->hash[i+1]; entry++) {
 5301791 25.6575 :                              const unsigned char *ref = entry->ptr;
                 :                              const unsigned char *src = data;
   25919  0.1254 :                              unsigned int ref_size = ref_top - ref;
  740077  3.5815 :                              if (entry->val != val)
                 :                                      continue;
     331  0.0016 :                              if (ref_size > top - src)
   83804  0.4056 :                                      ref_size = top - src;
   25059  0.1213 :                              if (ref_size <= msize)
                 :                                      break;
 1269621  6.1442 :                              while (ref_size-- && *src++ == *ref)
   42122  0.2038 :                                      ref++;
   14362  0.0695 :                              if (msize < ref - entry->ptr) {
                 :                                      /* this is our best match so far */
   10452  0.0506 :                                      msize = ref - entry->ptr;
    6882  0.0333 :                                      moff = entry->ptr - ref_data;
    6382  0.0309 :                                      if (msize >= 4096) /* good enough */
                 :                                              break;
                 :                              }
                 :                      }
--
Jon Smirl
jonsmirl@gmail.com
* Re: [PATCH 2/2] pack-objects: fix threaded load balancing
2007-12-10 6:19 ` Jon Smirl
@ 2007-12-10 16:03 ` Nicolas Pitre
0 siblings, 0 replies; 16+ messages in thread
From: Nicolas Pitre @ 2007-12-10 16:03 UTC (permalink / raw)
To: Jon Smirl; +Cc: Junio C Hamano, git
On Mon, 10 Dec 2007, Jon Smirl wrote:
> It seems to be caused by looping over those runs of 100,000 identical
> hash entries.
As mentioned earlier, there simply can't be 100000 identical hash
entries.
> 5091699 24.6407 : for (entry = index->hash[i]; entry < index->hash[i+1]; entry++) {
> 5301791 25.6575 : const unsigned char *ref = entry->ptr;
This loop is the very heart of the delta search code. It is normal to
see it high on the graph. In the worst cases, this loop might even be
entered for every byte in the source buffer.
And yet I bring you back to your observation that the source pack used
for the repack (whether it is the 2.1GB pack or the 300MB pack) has a
great influence on the performance outcome. The code above, though,
would be executed exactly the same number of times, with exactly the
same amount of memory allocated, regardless of the source pack used (as
long as you use -f with 'git repack', of course).
So this cannot be the culprit.
Nicolas
* Re: [PATCH 2/2] pack-objects: fix threaded load balancing
2007-12-10 6:06 ` Jon Smirl
2007-12-10 6:19 ` Jon Smirl
@ 2007-12-10 16:14 ` Nicolas Pitre
2007-12-10 17:06 ` Jon Smirl
1 sibling, 1 reply; 16+ messages in thread
From: Nicolas Pitre @ 2007-12-10 16:14 UTC (permalink / raw)
To: Jon Smirl; +Cc: Junio C Hamano, git
On Mon, 10 Dec 2007, Jon Smirl wrote:
> On 12/10/07, Jon Smirl <jonsmirl@gmail.com> wrote:
> > I just deleted the section looking for identical hashes.
> >
> > + while (sub_size && list[0]->hash &&
> > + list[0]->hash == list[-1]->hash) {
> > + list++;
> > + sub_size--;
> > + }
> >
> > Doing that allows the long chains to be split over the cores.
> >
> > My last 5% of objects is taking over 50% of the total CPU time in the
> > repack. I think these objects are the ones from that 103,817 entry
> > chain. It is also causing the explosion in RAM consumption.
> >
> > At the end I can only do 20 objects per clock second on four cores. It
> > takes 30 clock minutes (120 CPU minutes) to do the last 3% of objects.
>
> It's all in create_delta...
Here you're mixing two different hashes with no relation whatsoever to
each other.
The hash in create_delta corresponds to chunks of data in a reference
buffer that we try to match in a source buffer.
The hash in the code above has to do with the file names the
corresponding objects are coming from.
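For illustration only, the path hash referred to here is a tiny hash over
the file name, roughly of the following shape (a paraphrase, not the
verbatim git function); every object recorded under a frequently touched
path such as gcc/ChangeLog therefore ends up with the same hash value and
sorts next to its neighbours:

#include <ctype.h>

/*
 * Rough sketch of a pack-objects style name hash: whitespace is skipped
 * and the last characters of the path dominate the result, so objects
 * from the same path (or paths with the same ending) share one value.
 */
static unsigned name_hash(const char *name)
{
	unsigned hash = 0;
	unsigned char c;

	if (!name)
		return 0;
	while ((c = *name++) != 0) {
		if (isspace(c))
			continue;
		hash = (hash >> 2) + ((unsigned)c << 24);
	}
	return hash;
}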
And again, both hash uses are deterministic, i.e. they will be the same
when repacking with -f regardless of whether the source pack is the 2.1GB
or the 300MB one, so they cannot explain the huge performance and memory
usage discrepancy you see between those two packs.
The code that does get influenced by the source pack, though, is all
concentrated in sha1_file.c.
Nicolas
* Re: [PATCH 2/2] pack-objects: fix threaded load balancing
2007-12-10 16:14 ` Nicolas Pitre
@ 2007-12-10 17:06 ` Jon Smirl
2007-12-10 18:21 ` Nicolas Pitre
0 siblings, 1 reply; 16+ messages in thread
From: Jon Smirl @ 2007-12-10 17:06 UTC (permalink / raw)
To: Nicolas Pitre; +Cc: Junio C Hamano, git
On 12/10/07, Nicolas Pitre <nico@cam.org> wrote:
> On Mon, 10 Dec 2007, Jon Smirl wrote:
>
> > On 12/10/07, Jon Smirl <jonsmirl@gmail.com> wrote:
> > > I just deleted the section looking for identical hashes.
> > >
> > > + while (sub_size && list[0]->hash &&
> > > + list[0]->hash == list[-1]->hash) {
> > > + list++;
> > > + sub_size--;
> > > + }
> > >
> > > Doing that allows the long chains to be split over the cores.
> > >
> > > My last 5% of objects is taking over 50% of the total CPU time in the
> > > repack. I think these objects are the ones from that 103,817 entry
> > > chain. It is also causing the explosion in RAM consumption.
> > >
> > > At the end I can only do 20 objects per clock second on four cores. It
> > > takes 30 clock minutes (120 CPU minutes) to do the last 3% of objects.
> >
> > It's all in create_delta...
>
> Here you're mixing two different hashes with no relation whatsoever to
> each other.
>
> The hash in create_delta corresponds to chunks of data in a reference
> buffer that we try to match in a source buffer.
>
> The hash in the code above has to do with the file names the
> corresponding objects are coming from.
So can we change this loop to exit after a max of window_size * 10 or
something like that iterations? Without capping it the threads become
way unbalanced in the end. In the gcc case one thread is continuing
30+ minutes past the others exiting.
> And again, both hash uses are deterministic, i.e. they will be the same
> when repacking with -f regardless of whether the source pack is the 2.1GB
> or the 300MB one, so they cannot explain the huge performance and memory
> usage discrepancy you see between those two packs.
There is a correlation here but I don't know what it is. The memory
blow up occurs near the end of repacking. At the same time I move from
processing hundreds of objects per second to just a few per second.
And the threads are getting unbalanced.
I did notice that when I removed the above loop and evened things out
memory consumption did not spike as bad as it previously did. I maxed
out at 3GB instead of 4.5GB.
Linus suggested memory fragmentation could be a culprit. Evening the
threads out changed the allocation pattern. It is possible that it
avoided a fragmentation problem. It is also possible that evening
things out split the work so that less memory needed to be allocated.
Don't take any of these numbers as gospel. I am using the machine
for other things while I run these tests, and there may be
interactions.
>
> The code that do get influenced by the source pack, though, is all
> concentrated in sha1_file.c.
>
>
> Nicolas
>
--
Jon Smirl
jonsmirl@gmail.com
* Re: [PATCH 2/2] pack-objects: fix threaded load balancing
2007-12-10 17:06 ` Jon Smirl
@ 2007-12-10 18:21 ` Nicolas Pitre
2007-12-10 19:19 ` [PATCH] pack-objects: more threaded load balancing fix with often changed paths Nicolas Pitre
0 siblings, 1 reply; 16+ messages in thread
From: Nicolas Pitre @ 2007-12-10 18:21 UTC (permalink / raw)
To: Jon Smirl; +Cc: Junio C Hamano, git
On Mon, 10 Dec 2007, Jon Smirl wrote:
> On 12/10/07, Nicolas Pitre <nico@cam.org> wrote:
>
> > The hash in the code above has to do with the file names the
> > corresponding objects are coming from.
>
> So can we change this loop to exit after a max of window_size * 10 or
> something like that iterations? Without capping it the threads become
> way unbalanced in the end. In the gcc case one thread is continuing
> 30+ minutes past the others exiting.
Indeed, some more tweaking is needed.
The object path distribution goes like this for the gcc repo:
105557 gcc
42537 gcc/ChangeLog
25210 gcc/config
20690 gcc/testsuite
13434 gcc/testsuite/ChangeLog
12363 libstdc++-v3
9346 gcc/cp
8757 libstdc++-v3/include
8186 gcc/version.c
7867 gcc/cp/ChangeLog
7737 libstdc++-v3/include/bits
7653 libjava
6577 gcc/testsuite/gcc.dg
5942 libjava/ChangeLog
5351 gcc/config/i386
5260 gcc/testsuite/g++.dg
4451 gcc/f
4330 libstdc++-v3/ChangeLog
4321 libstdc++-v3/include/bits/c++config
4316 gcc/doc
[...]
So... the top entries are most certainly going to create load balancing
issues if their path hash clustering isn't broken.
Nicolas
* [PATCH] pack-objects: more threaded load balancing fix with often changed paths
2007-12-10 18:21 ` Nicolas Pitre
@ 2007-12-10 19:19 ` Nicolas Pitre
0 siblings, 0 replies; 16+ messages in thread
From: Nicolas Pitre @ 2007-12-10 19:19 UTC (permalink / raw)
To: Junio C Hamano, Jon Smirl; +Cc: git
The code that splits the object list amongst work threads tries to do so
on "path" boundaries so as not to prevent good delta matches. However, in
some cases a few paths may largely dominate the hash distribution, and
it is not possible to have good load balancing without ignoring those
boundaries.
Signed-off-by: Nicolas Pitre <nico@cam.org>
---
On Mon, 10 Dec 2007, Nicolas Pitre wrote:
> On Mon, 10 Dec 2007, Jon Smirl wrote:
>
> > On 12/10/07, Nicolas Pitre <nico@cam.org> wrote:
> >
> > > The hash in the code above has to do with the file names the
> > > corresponding objects are coming from.
> >
> > So can we change this loop to exit after a max of window_size * 10 or
> > something like that iterations? Without capping it the threads become
> > way unbalanced in the end. In the gcc case one thread is continuing
> > 30+ minutes past the others exiting.
>
> Indeed, some more tweaking is needed.
>
> The object path distribution goes like this for the gcc repo:
>
> 105557 gcc
> 42537 gcc/ChangeLog
> 25210 gcc/config
> 20690 gcc/testsuite
> 13434 gcc/testsuite/ChangeLog
> 12363 libstdc++-v3
> 9346 gcc/cp
> 8757 libstdc++-v3/include
> 8186 gcc/version.c
> 7867 gcc/cp/ChangeLog
> 7737 libstdc++-v3/include/bits
> 7653 libjava
> 6577 gcc/testsuite/gcc.dg
> 5942 libjava/ChangeLog
> 5351 gcc/config/i386
> 5260 gcc/testsuite/g++.dg
> 4451 gcc/f
> 4330 libstdc++-v3/ChangeLog
> 4321 libstdc++-v3/include/bits/c++config
> 4316 gcc/doc
> [...]
>
> So... the top entries are most certainly going to create load balancing
> issues if their path hash clustering isn't broken.
Here's the fix.
diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c
index 250dc56..7dd0d7f 100644
--- a/builtin-pack-objects.c
+++ b/builtin-pack-objects.c
@@ -1709,6 +1709,16 @@ static void ll_find_deltas(struct object_entry **list, unsigned list_size,
list++;
sub_size--;
}
+ if (!sub_size) {
+ /*
+ * It is possible for some "paths" to have
+ * so many objects that no hash boundary
+ * might be found. Let's just steal the
+ * exact half in that case.
+ */
+ sub_size = victim->remaining / 2;
+ list -= sub_size;
+ }
target->list = list;
victim->list_size -= sub_size;
victim->remaining -= sub_size;
* Re: [PATCH 2/2] pack-objects: fix threaded load balancing
2007-12-08 5:03 [PATCH 2/2] pack-objects: fix threaded load balancing Nicolas Pitre
` (2 preceding siblings ...)
2007-12-10 4:30 ` Jon Smirl
@ 2007-12-11 17:02 ` Johannes Sixt
2007-12-11 17:28 ` Nicolas Pitre
3 siblings, 1 reply; 16+ messages in thread
From: Johannes Sixt @ 2007-12-11 17:02 UTC (permalink / raw)
To: Nicolas Pitre; +Cc: Junio C Hamano, git, Jon Smirl
Nicolas Pitre schrieb:
> @@ -1612,10 +1620,10 @@ static void *threaded_find_deltas(void *arg)
> pthread_mutex_lock(&data_ready);
> pthread_mutex_unlock(&data_request);
>
> - if (!me->list_size)
> + if (!me->remaining)
> return NULL;
>
> - find_deltas(me->list, me->list_size,
> + find_deltas(me->list, &me->remaining,
> me->window, me->depth, me->processed);
> }
> }
This hunk caught my attention. &data_ready is locked, but not released in
this function.
Looking more closely at the code surrounding this hunk, it seems that the
lock is released by a *different* thread than the one that locked it. This
works on Linux, but is not portable. We will have to use condition variables
like everyone else does in a producer-consumer-like scenario.
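A minimal sketch of that producer/consumer handshake with a condition
variable (hypothetical names, not the actual pack-objects code; every
mutex is then locked and unlocked by the same thread):

#include <pthread.h>

struct work_slot {
	pthread_mutex_t mutex;	/* init with PTHREAD_MUTEX_INITIALIZER */
	pthread_cond_t cond;	/* init with PTHREAD_COND_INITIALIZER */
	int data_ready;		/* predicate guarded by mutex */
	void *payload;
};

/* consumer (work thread) */
static void *wait_for_work(struct work_slot *slot)
{
	void *payload;

	pthread_mutex_lock(&slot->mutex);
	while (!slot->data_ready)	/* loop: guards against spurious wakeups */
		pthread_cond_wait(&slot->cond, &slot->mutex);
	slot->data_ready = 0;
	payload = slot->payload;
	pthread_mutex_unlock(&slot->mutex);
	return payload;
}

/* producer (main thread) */
static void hand_out_work(struct work_slot *slot, void *payload)
{
	pthread_mutex_lock(&slot->mutex);
	slot->payload = payload;
	slot->data_ready = 1;
	pthread_cond_signal(&slot->cond);
	pthread_mutex_unlock(&slot->mutex);
}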
-- Hannes
* Re: [PATCH 2/2] pack-objects: fix threaded load balancing
2007-12-11 17:02 ` [PATCH 2/2] pack-objects: fix threaded load balancing Johannes Sixt
@ 2007-12-11 17:28 ` Nicolas Pitre
2007-12-13 7:15 ` Johannes Sixt
0 siblings, 1 reply; 16+ messages in thread
From: Nicolas Pitre @ 2007-12-11 17:28 UTC (permalink / raw)
To: Johannes Sixt; +Cc: Junio C Hamano, git, Jon Smirl
On Tue, 11 Dec 2007, Johannes Sixt wrote:
> Nicolas Pitre schrieb:
> > @@ -1612,10 +1620,10 @@ static void *threaded_find_deltas(void *arg)
> > pthread_mutex_lock(&data_ready);
> > pthread_mutex_unlock(&data_request);
> >
> > - if (!me->list_size)
> > + if (!me->remaining)
> > return NULL;
> >
> > - find_deltas(me->list, me->list_size,
> > + find_deltas(me->list, &me->remaining,
> > me->window, me->depth, me->processed);
> > }
> > }
>
> This hunk caught my attention. &data_ready is locked, but not released in
> this function.
>
> Looking more closely at the code surrounding this hunk, it seems that the
> lock is released by a *different* thread than the one that locked it. This
> works on Linux, but is not portable. We will have to use condition variables
> like everyone else does in a producer-consumer-like scenario.
Are you willing to make a patch for it?
Nicolas
* Re: [PATCH 2/2] pack-objects: fix threaded load balancing
2007-12-11 17:28 ` Nicolas Pitre
@ 2007-12-13 7:15 ` Johannes Sixt
0 siblings, 0 replies; 16+ messages in thread
From: Johannes Sixt @ 2007-12-13 7:15 UTC (permalink / raw)
To: Nicolas Pitre; +Cc: Junio C Hamano, git, Jon Smirl
Nicolas Pitre schrieb:
> On Tue, 11 Dec 2007, Johannes Sixt wrote:
>
>> Nicolas Pitre schrieb:
>>> @@ -1612,10 +1620,10 @@ static void *threaded_find_deltas(void *arg)
>>> pthread_mutex_lock(&data_ready);
>>> pthread_mutex_unlock(&data_request);
>>>
>>> - if (!me->list_size)
>>> + if (!me->remaining)
>>> return NULL;
>>>
>>> - find_deltas(me->list, me->list_size,
>>> + find_deltas(me->list, &me->remaining,
>>> me->window, me->depth, me->processed);
>>> }
>>> }
>> This hunk caught my attention. &data_ready is locked, but not released in
>> this function.
>>
>> Looking more closely at the code surrounding this hunk, it seems that the
>> lock is released by a *different* thread than the one that locked it. This
>> works on Linux, but is not portable. We will have to use condition variables
>> like everyone else does in a producer-consumer-like scenario.
>
> Are you willing to make a patch for it?
I'm working on it, slowly.
-- Hannes