* [PATCH 2/2] --pretty=format: on-demand format expansion
@ 2007-11-09 0:49 René Scharfe
2007-11-09 1:24 ` Johannes Schindelin
` (3 more replies)
0 siblings, 4 replies; 24+ messages in thread
From: René Scharfe @ 2007-11-09 0:49 UTC (permalink / raw)
To: Junio C Hamano
Cc: Paul Mackerras, Git Mailing List, Pierre Habouzit,
Johannes Schindelin
Some of the --pretty=format placeholders expansions are expensive to
calculate. This is made worse by the current code's use of
interpolate(), which requires _all_ placeholders are to be prepared
up front.
One way to speed this up is to check which placeholders are present
in the format string and to prepare only the expansions that are
needed. That still leaves the allocation overhead of interpolate().
Another way is to use a callback based approach together with the
strbuf library to keep allocations to a minimum and avoid string
copies. That's what this patch does. It introduces a new strbuf
function, strbuf_expand().
The function takes a format string, list of placeholder strings,
a user supplied function 'fn', and an opaque pointer 'context'
to tell 'fn' what thingy to operate on.
The function 'fn' is expected to accept a strbuf, a parsed
placeholder string and the 'context' pointer, and append the
interpolated value for the 'context' thingy, according to the
format specified by the placeholder.
Thanks to Pierre Habouzit for his suggestion to use strchrnul() and
the code surrounding its callsite. And thanks to Junio for most of
this commit message. :)
Here my measurements of most of Paul Mackerras' test cases that
highlighted the performance problem (best of three runs):
(master)
$ time git log --pretty=oneline >/dev/null
real 0m0.390s
user 0m0.340s
sys 0m0.040s
(master)
$ time git log --pretty=raw >/dev/null
real 0m0.434s
user 0m0.408s
sys 0m0.016s
(master)
$ time git log --pretty="format:%H {%P} %ct" >/dev/null
real 0m1.347s
user 0m0.080s
sys 0m1.256s
(interp_find_active -- Dscho)
$ time ./git log --pretty="format:%H {%P} %ct" >/dev/null
real 0m0.694s
user 0m0.020s
sys 0m0.672s
(strbuf_expand -- this patch)
$ time ./git log --pretty="format:%H {%P} %ct" >/dev/null
real 0m0.395s
user 0m0.352s
sys 0m0.028s
Signed-off-by: Rene Scharfe <rene.scharfe@lsrfire.ath.cx>
---
This version fixes a bug in the first one, which would eat
percent signs that were not followed by a placeholder from the
list. That may be a valid thing to do for a brand-new parser,
but it regressed on what interpolate() did.
strbuf.c | 24 ++++++
strbuf.h | 3 +
pretty.c | 276 ++++++++++++++++++++++++++++++++++----------------------------
3 files changed, 180 insertions(+), 123 deletions(-)
diff --git a/strbuf.c b/strbuf.c
index f4201e1..536b432 100644
--- a/strbuf.c
+++ b/strbuf.c
@@ -129,6 +129,30 @@ void strbuf_addf(struct strbuf *sb, const char *fmt, ...)
strbuf_setlen(sb, sb->len + len);
}
+void strbuf_expand(struct strbuf *sb, const char *format,
+ const char **placeholders, expand_fn_t fn, void *context)
+{
+ for (;;) {
+ const char *percent, **p;
+
+ percent = strchrnul(format, '%');
+ strbuf_add(sb, format, percent - format);
+ if (!*percent)
+ break;
+ format = percent + 1;
+
+ for (p = placeholders; *p; p++) {
+ if (!prefixcmp(format, *p))
+ break;
+ }
+ if (*p) {
+ fn(sb, *p, context);
+ format += strlen(*p);
+ } else
+ strbuf_addch(sb, '%');
+ }
+}
+
size_t strbuf_fread(struct strbuf *sb, size_t size, FILE *f)
{
size_t res;
diff --git a/strbuf.h b/strbuf.h
index cd7f295..21d8944 100644
--- a/strbuf.h
+++ b/strbuf.h
@@ -102,6 +102,9 @@ static inline void strbuf_addbuf(struct strbuf *sb, struct strbuf *sb2) {
strbuf_add(sb, sb2->buf, sb2->len);
}
+typedef void (*expand_fn_t) (struct strbuf *sb, const char *placeholder, void *context);
+extern void strbuf_expand(struct strbuf *sb, const char *format, const char **placeholders, expand_fn_t fn, void *context);
+
__attribute__((format(printf,2,3)))
extern void strbuf_addf(struct strbuf *sb, const char *fmt, ...);
diff --git a/pretty.c b/pretty.c
index 490cede..9fbd73f 100644
--- a/pretty.c
+++ b/pretty.c
@@ -1,6 +1,5 @@
#include "cache.h"
#include "commit.h"
-#include "interpolate.h"
#include "utf8.h"
#include "diff.h"
#include "revision.h"
@@ -283,7 +282,8 @@ static char *logmsg_reencode(const struct commit *commit,
return out;
}
-static void fill_person(struct interp *table, const char *msg, int len)
+static void format_person_part(struct strbuf *sb, char part,
+ const char *msg, int len)
{
int start, end, tz = 0;
unsigned long date;
@@ -295,7 +295,10 @@ static void fill_person(struct interp *table, const char *msg, int len)
start = end + 1;
while (end > 0 && isspace(msg[end - 1]))
end--;
- table[0].value = xmemdupz(msg, end);
+ if (part == 'n') { /* name */
+ strbuf_add(sb, msg, end);
+ return;
+ }
if (start >= len)
return;
@@ -307,7 +310,10 @@ static void fill_person(struct interp *table, const char *msg, int len)
if (end >= len)
return;
- table[1].value = xmemdupz(msg + start, end - start);
+ if (part == 'e') { /* email */
+ strbuf_add(sb, msg + start, end - start);
+ return;
+ }
/* parse date */
for (start = end + 1; start < len && isspace(msg[start]); start++)
@@ -318,7 +324,10 @@ static void fill_person(struct interp *table, const char *msg, int len)
if (msg + start == ep)
return;
- table[5].value = xmemdupz(msg + start, ep - (msg + start));
+ if (part == 't') { /* date, UNIX timestamp */
+ strbuf_add(sb, msg + start, ep - (msg + start));
+ return;
+ }
/* parse tz */
for (start = ep - msg + 1; start < len && isspace(msg[start]); start++)
@@ -329,123 +338,107 @@ static void fill_person(struct interp *table, const char *msg, int len)
tz = -tz;
}
- interp_set_entry(table, 2, show_date(date, tz, DATE_NORMAL));
- interp_set_entry(table, 3, show_date(date, tz, DATE_RFC2822));
- interp_set_entry(table, 4, show_date(date, tz, DATE_RELATIVE));
- interp_set_entry(table, 6, show_date(date, tz, DATE_ISO8601));
+ switch (part) {
+ case 'd': /* date */
+ strbuf_addstr(sb, show_date(date, tz, DATE_NORMAL));
+ return;
+ case 'D': /* date, RFC2822 style */
+ strbuf_addstr(sb, show_date(date, tz, DATE_RFC2822));
+ return;
+ case 'r': /* date, relative */
+ strbuf_addstr(sb, show_date(date, tz, DATE_RELATIVE));
+ return;
+ case 'i': /* date, ISO 8601 */
+ strbuf_addstr(sb, show_date(date, tz, DATE_ISO8601));
+ return;
+ }
}
-void format_commit_message(const struct commit *commit,
- const void *format, struct strbuf *sb)
+static void format_commit_item(struct strbuf *sb, const char *placeholder,
+ void *context)
{
- struct interp table[] = {
- { "%H" }, /* commit hash */
- { "%h" }, /* abbreviated commit hash */
- { "%T" }, /* tree hash */
- { "%t" }, /* abbreviated tree hash */
- { "%P" }, /* parent hashes */
- { "%p" }, /* abbreviated parent hashes */
- { "%an" }, /* author name */
- { "%ae" }, /* author email */
- { "%ad" }, /* author date */
- { "%aD" }, /* author date, RFC2822 style */
- { "%ar" }, /* author date, relative */
- { "%at" }, /* author date, UNIX timestamp */
- { "%ai" }, /* author date, ISO 8601 */
- { "%cn" }, /* committer name */
- { "%ce" }, /* committer email */
- { "%cd" }, /* committer date */
- { "%cD" }, /* committer date, RFC2822 style */
- { "%cr" }, /* committer date, relative */
- { "%ct" }, /* committer date, UNIX timestamp */
- { "%ci" }, /* committer date, ISO 8601 */
- { "%e" }, /* encoding */
- { "%s" }, /* subject */
- { "%b" }, /* body */
- { "%Cred" }, /* red */
- { "%Cgreen" }, /* green */
- { "%Cblue" }, /* blue */
- { "%Creset" }, /* reset color */
- { "%n" }, /* newline */
- { "%m" }, /* left/right/bottom */
- };
- enum interp_index {
- IHASH = 0, IHASH_ABBREV,
- ITREE, ITREE_ABBREV,
- IPARENTS, IPARENTS_ABBREV,
- IAUTHOR_NAME, IAUTHOR_EMAIL,
- IAUTHOR_DATE, IAUTHOR_DATE_RFC2822, IAUTHOR_DATE_RELATIVE,
- IAUTHOR_TIMESTAMP, IAUTHOR_ISO8601,
- ICOMMITTER_NAME, ICOMMITTER_EMAIL,
- ICOMMITTER_DATE, ICOMMITTER_DATE_RFC2822,
- ICOMMITTER_DATE_RELATIVE, ICOMMITTER_TIMESTAMP,
- ICOMMITTER_ISO8601,
- IENCODING,
- ISUBJECT,
- IBODY,
- IRED, IGREEN, IBLUE, IRESET_COLOR,
- INEWLINE,
- ILEFT_RIGHT,
- };
+ const struct commit *commit = context;
struct commit_list *p;
- char parents[1024];
- unsigned long len;
int i;
enum { HEADER, SUBJECT, BODY } state;
const char *msg = commit->buffer;
- if (ILEFT_RIGHT + 1 != ARRAY_SIZE(table))
- die("invalid interp table!");
-
/* these are independent of the commit */
- interp_set_entry(table, IRED, "\033[31m");
- interp_set_entry(table, IGREEN, "\033[32m");
- interp_set_entry(table, IBLUE, "\033[34m");
- interp_set_entry(table, IRESET_COLOR, "\033[m");
- interp_set_entry(table, INEWLINE, "\n");
+ switch (placeholder[0]) {
+ case 'C':
+ switch (placeholder[3]) {
+ case 'd': /* red */
+ strbuf_addstr(sb, "\033[31m");
+ return;
+ case 'e': /* green */
+ strbuf_addstr(sb, "\033[32m");
+ return;
+ case 'u': /* blue */
+ strbuf_addstr(sb, "\033[34m");
+ return;
+ case 's': /* reset color */
+ strbuf_addstr(sb, "\033[m");
+ return;
+ }
+ case 'n': /* newline */
+ strbuf_addch(sb, '\n');
+ return;
+ }
/* these depend on the commit */
if (!commit->object.parsed)
parse_object(commit->object.sha1);
- interp_set_entry(table, IHASH, sha1_to_hex(commit->object.sha1));
- interp_set_entry(table, IHASH_ABBREV,
- find_unique_abbrev(commit->object.sha1,
- DEFAULT_ABBREV));
- interp_set_entry(table, ITREE, sha1_to_hex(commit->tree->object.sha1));
- interp_set_entry(table, ITREE_ABBREV,
- find_unique_abbrev(commit->tree->object.sha1,
- DEFAULT_ABBREV));
- interp_set_entry(table, ILEFT_RIGHT,
- (commit->object.flags & BOUNDARY)
- ? "-"
- : (commit->object.flags & SYMMETRIC_LEFT)
- ? "<"
- : ">");
-
- parents[1] = 0;
- for (i = 0, p = commit->parents;
- p && i < sizeof(parents) - 1;
- p = p->next)
- i += snprintf(parents + i, sizeof(parents) - i - 1, " %s",
- sha1_to_hex(p->item->object.sha1));
- interp_set_entry(table, IPARENTS, parents + 1);
-
- parents[1] = 0;
- for (i = 0, p = commit->parents;
- p && i < sizeof(parents) - 1;
- p = p->next)
- i += snprintf(parents + i, sizeof(parents) - i - 1, " %s",
- find_unique_abbrev(p->item->object.sha1,
- DEFAULT_ABBREV));
- interp_set_entry(table, IPARENTS_ABBREV, parents + 1);
+ switch (placeholder[0]) {
+ case 'H': /* commit hash */
+ strbuf_addstr(sb, sha1_to_hex(commit->object.sha1));
+ return;
+ case 'h': /* abbreviated commit hash */
+ strbuf_addstr(sb, find_unique_abbrev(commit->object.sha1,
+ DEFAULT_ABBREV));
+ return;
+ case 'T': /* tree hash */
+ strbuf_addstr(sb, sha1_to_hex(commit->tree->object.sha1));
+ return;
+ case 't': /* abbreviated tree hash */
+ strbuf_addstr(sb, find_unique_abbrev(commit->tree->object.sha1,
+ DEFAULT_ABBREV));
+ return;
+ case 'P': /* parent hashes */
+ for (p = commit->parents; p; p = p->next) {
+ if (p != commit->parents)
+ strbuf_addch(sb, ' ');
+ strbuf_addstr(sb, sha1_to_hex(p->item->object.sha1));
+ }
+ return;
+ case 'p': /* abbreviated parent hashes */
+ for (p = commit->parents; p; p = p->next) {
+ if (p != commit->parents)
+ strbuf_addch(sb, ' ');
+ strbuf_addstr(sb, find_unique_abbrev(
+ p->item->object.sha1, DEFAULT_ABBREV));
+ }
+ return;
+ case 'm': /* left/right/bottom */
+ strbuf_addch(sb, (commit->object.flags & BOUNDARY)
+ ? '-'
+ : (commit->object.flags & SYMMETRIC_LEFT)
+ ? '<'
+ : '>');
+ return;
+ }
+
+ /* For the rest we have to parse the commit header. */
for (i = 0, state = HEADER; msg[i] && state < BODY; i++) {
int eol;
for (eol = i; msg[eol] && msg[eol] != '\n'; eol++)
; /* do nothing */
if (state == SUBJECT) {
- table[ISUBJECT].value = xmemdupz(msg + i, eol - i);
+ if (placeholder[0] == 's') {
+ strbuf_add(sb, msg + i, eol - i);
+ return;
+ }
i = eol;
}
if (i == eol) {
@@ -453,29 +446,66 @@ void format_commit_message(const struct commit *commit,
/* strip empty lines */
while (msg[eol + 1] == '\n')
eol++;
- } else if (!prefixcmp(msg + i, "author "))
- fill_person(table + IAUTHOR_NAME,
- msg + i + 7, eol - i - 7);
- else if (!prefixcmp(msg + i, "committer "))
- fill_person(table + ICOMMITTER_NAME,
- msg + i + 10, eol - i - 10);
- else if (!prefixcmp(msg + i, "encoding "))
- table[IENCODING].value =
- xmemdupz(msg + i + 9, eol - i - 9);
+ } else if (!prefixcmp(msg + i, "author ")) {
+ if (placeholder[0] == 'a') {
+ format_person_part(sb, placeholder[1],
+ msg + i + 7, eol - i - 7);
+ return;
+ }
+ } else if (!prefixcmp(msg + i, "committer ")) {
+ if (placeholder[0] == 'c') {
+ format_person_part(sb, placeholder[1],
+ msg + i + 10, eol - i - 10);
+ return;
+ }
+ } else if (!prefixcmp(msg + i, "encoding ")) {
+ if (placeholder[0] == 'e') {
+ strbuf_add(sb, msg + i + 9, eol - i - 9);
+ return;
+ }
+ }
i = eol;
}
- if (msg[i])
- table[IBODY].value = xstrdup(msg + i);
-
- len = interpolate(sb->buf + sb->len, strbuf_avail(sb),
- format, table, ARRAY_SIZE(table));
- if (len > strbuf_avail(sb)) {
- strbuf_grow(sb, len);
- interpolate(sb->buf + sb->len, strbuf_avail(sb) + 1,
- format, table, ARRAY_SIZE(table));
- }
- strbuf_setlen(sb, sb->len + len);
- interp_clear_table(table, ARRAY_SIZE(table));
+ if (msg[i] && placeholder[0] == 'b') /* body */
+ strbuf_addstr(sb, msg + i);
+}
+
+void format_commit_message(const struct commit *commit,
+ const void *format, struct strbuf *sb)
+{
+ const char *placeholders[] = {
+ "H", /* commit hash */
+ "h", /* abbreviated commit hash */
+ "T", /* tree hash */
+ "t", /* abbreviated tree hash */
+ "P", /* parent hashes */
+ "p", /* abbreviated parent hashes */
+ "an", /* author name */
+ "ae", /* author email */
+ "ad", /* author date */
+ "aD", /* author date, RFC2822 style */
+ "ar", /* author date, relative */
+ "at", /* author date, UNIX timestamp */
+ "ai", /* author date, ISO 8601 */
+ "cn", /* committer name */
+ "ce", /* committer email */
+ "cd", /* committer date */
+ "cD", /* committer date, RFC2822 style */
+ "cr", /* committer date, relative */
+ "ct", /* committer date, UNIX timestamp */
+ "ci", /* committer date, ISO 8601 */
+ "e", /* encoding */
+ "s", /* subject */
+ "b", /* body */
+ "Cred", /* red */
+ "Cgreen", /* green */
+ "Cblue", /* blue */
+ "Creset", /* reset color */
+ "n", /* newline */
+ "m", /* left/right/bottom */
+ NULL
+ };
+ strbuf_expand(sb, format, placeholders, format_commit_item, (void *)commit);
}
static void pp_header(enum cmit_fmt fmt,
^ permalink raw reply related [flat|nested] 24+ messages in thread
* Re: [PATCH 2/2] --pretty=format: on-demand format expansion
2007-11-09 0:49 [PATCH 2/2] --pretty=format: on-demand format expansion René Scharfe
@ 2007-11-09 1:24 ` Johannes Schindelin
2007-11-09 21:13 ` René Scharfe
2007-11-09 4:50 ` Jeff King
` (2 subsequent siblings)
3 siblings, 1 reply; 24+ messages in thread
From: Johannes Schindelin @ 2007-11-09 1:24 UTC (permalink / raw)
To: René Scharfe
Cc: Junio C Hamano, Paul Mackerras, Git Mailing List, Pierre Habouzit
Hi,
On Fri, 9 Nov 2007, Ren? Scharfe wrote:
> strbuf.c | 24 ++++++
> strbuf.h | 3 +
> pretty.c | 276 ++++++++++++++++++++++++++++++++++----------------------------
I would be so grateful if you could (trivially) split up this patch into
the addition of strbuf_expend() (with a small example in the commit
message), and a patch that uses it in pretty.c.
Thanks,
Dscho
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 2/2] --pretty=format: on-demand format expansion
2007-11-09 0:49 [PATCH 2/2] --pretty=format: on-demand format expansion René Scharfe
2007-11-09 1:24 ` Johannes Schindelin
@ 2007-11-09 4:50 ` Jeff King
2007-11-09 23:16 ` René Scharfe
2007-11-09 4:52 ` [PATCH 2/2] --pretty=format: on-demand format expansion Jeff King
2007-11-09 23:39 ` Paul Mackerras
3 siblings, 1 reply; 24+ messages in thread
From: Jeff King @ 2007-11-09 4:50 UTC (permalink / raw)
To: René Scharfe
Cc: Junio C Hamano, Paul Mackerras, Git Mailing List, Pierre Habouzit,
Johannes Schindelin
On Fri, Nov 09, 2007 at 01:49:42AM +0100, René Scharfe wrote:
> Another way is to use a callback based approach together with the
> strbuf library to keep allocations to a minimum and avoid string
> copies. That's what this patch does. It introduces a new strbuf
> function, strbuf_expand().
I think this is definitely the right approach, but there are some format
strings where the performance will be worse. Specifically:
- formatting expensive items multiple times will incur work
proportional to the number of times the item is used (in the old
code, it was calculated just once). e.g., "%h%h%h%h"
- formatting some items goes to some work that can be re-used by other
items (e.g., %ad and %ar both need to parse the author date)
And we could obviously overcome both by caching the results of expensive
operations. I'm not sure if these will be a problem in practice. For
the first one, the new code is so much faster that I needed to do
git-log --pretty=format:%h%h%h%h%h%h%h%h
to get a performance regression from the old code, which seems rather
unlikely. For the second, it is easy to imagine multiple "person" items
being used together, although their cost to produce is not all that
high. It looks like about .05 seconds to parse a date (over all commits
in git.git):
$ time ./git-log --pretty='format:' >/dev/null
real 0m0.441s
user 0m0.424s
sys 0m0.004s
$ time ./git-log --pretty='format:%ad' >/dev/null
real 0m0.477s
user 0m0.472s
sys 0m0.000s
$ time ./git-log --pretty='format:%ad %aD' >/dev/null
real 0m0.527s
user 0m0.520s
sys 0m0.004s
where the last two could probably end up costing about the same if we cached
the author parsing (but the caching will have a cost, too, so it might not end
up being a big win).
So it might make sense to cache some items as we figure them out. This
should be done by the calling code and not by strbuf_expand (since it
doesn't know which things are worth caching (and for fast things,
allocating memory for a cache entry is likely to be slower), or how the
expansions relate to each other).
A partial patch on top of yours is below (it caches commit and tree
abbreviations; parent abbreviations and person-parsing are probably
worth doing). Some timings:
Null format (these are average-looking runs; the differences got lost
in the noise):
# your patch
$ time git-log --pretty=format: >/dev/null
real 0m0.409s
user 0m0.384s
sys 0m0.012s
# with my patch
$ time ./git-log --pretty=format: >/dev/null
real 0m0.413s
user 0m0.404s
sys 0m0.004s
Single abbrev lookup (mine should be slightly slower because of
malloc/free of cache):
# your patch
$ time git-log --pretty=format:%h >/dev/null
real 0m0.536s
user 0m0.456s
sys 0m0.080s
# with my patch
$ time ./git-log --pretty=format:%h >/dev/null
real 0m0.553s
user 0m0.464s
sys 0m0.088s
Two abbrev lookups (I win by a little bit, but definitely not lost in
the noise):
# your patch
$ time git-log --pretty=format:%h%h >/dev/null
real 0m0.671s
user 0m0.496s
sys 0m0.144s
# my patch
$ time ./git-log --pretty=format:%h%h >/dev/null
real 0m0.567s
user 0m0.480s
sys 0m0.080s
And of course I can make pathological cases where mine wins hands down
(on "%h%h%h%h%h%h%h%h", mine stays the same but yours bumps to 1.2s).
So I think this is probably worth doing. Even if doubled work isn't the
common case,
1. It doesn't hurt the common case much at all (I think on average it
is slower, but the timings were totally lost in the noise)
2. It has a measurable impact on reasonable cases (like just using an
expensive substitution twice)
3. It has a huge impact on pathological cases (though I'm not sure we
care about those that much)
4. It's very little extra code, and it should be obvious to read. It
also documents the technique for other users of strbuf_expand,
where the "doubled" cases may be more common.
-Peff
---
pretty.c | 47 ++++++++++++++++++++++++++++++++++++++++-------
1 files changed, 40 insertions(+), 7 deletions(-)
diff --git a/pretty.c b/pretty.c
index 9fbd73f..8ae6fdd 100644
--- a/pretty.c
+++ b/pretty.c
@@ -282,6 +282,27 @@ static char *logmsg_reencode(const struct commit *commit,
return out;
}
+struct pretty_context {
+ const struct commit *commit;
+ char *abbrev_commit;
+ char *abbrev_tree;
+};
+
+static void pretty_context_init(struct pretty_context *p, const struct commit *c)
+{
+ p->commit = c;
+ p->abbrev_commit = NULL;
+ p->abbrev_tree = NULL;
+}
+
+static void pretty_context_free(struct pretty_context *p)
+{
+ if (p->abbrev_commit)
+ free(p->abbrev_commit);
+ if (p->abbrev_tree)
+ free(p->abbrev_tree);
+}
+
static void format_person_part(struct strbuf *sb, char part,
const char *msg, int len)
{
@@ -355,9 +376,10 @@ static void format_person_part(struct strbuf *sb, char part,
}
static void format_commit_item(struct strbuf *sb, const char *placeholder,
- void *context)
+ void *vcontext)
{
- const struct commit *commit = context;
+ struct pretty_context *context = vcontext;
+ const struct commit *commit = context->commit;
struct commit_list *p;
int i;
enum { HEADER, SUBJECT, BODY } state;
@@ -394,15 +416,23 @@ static void format_commit_item(struct strbuf *sb, const char *placeholder,
strbuf_addstr(sb, sha1_to_hex(commit->object.sha1));
return;
case 'h': /* abbreviated commit hash */
- strbuf_addstr(sb, find_unique_abbrev(commit->object.sha1,
- DEFAULT_ABBREV));
+ if (!context->abbrev_commit)
+ context->abbrev_commit = xstrdup(
+ find_unique_abbrev(
+ commit->object.sha1,
+ DEFAULT_ABBREV));
+ strbuf_addstr(sb, context->abbrev_commit);
return;
case 'T': /* tree hash */
strbuf_addstr(sb, sha1_to_hex(commit->tree->object.sha1));
return;
case 't': /* abbreviated tree hash */
- strbuf_addstr(sb, find_unique_abbrev(commit->tree->object.sha1,
- DEFAULT_ABBREV));
+ if (!context->abbrev_tree)
+ context->abbrev_tree = xstrdup(
+ find_unique_abbrev(
+ commit->tree->object.sha1,
+ DEFAULT_ABBREV));
+ strbuf_addstr(sb, context->abbrev_tree);
return;
case 'P': /* parent hashes */
for (p = commit->parents; p; p = p->next) {
@@ -505,7 +535,10 @@ void format_commit_message(const struct commit *commit,
"m", /* left/right/bottom */
NULL
};
- strbuf_expand(sb, format, placeholders, format_commit_item, (void *)commit);
+ struct pretty_context context;
+ pretty_context_init(&context, commit);
+ strbuf_expand(sb, format, placeholders, format_commit_item, &context);
+ pretty_context_free(&context);
}
static void pp_header(enum cmit_fmt fmt,
^ permalink raw reply related [flat|nested] 24+ messages in thread
* Re: [PATCH 2/2] --pretty=format: on-demand format expansion
2007-11-09 0:49 [PATCH 2/2] --pretty=format: on-demand format expansion René Scharfe
2007-11-09 1:24 ` Johannes Schindelin
2007-11-09 4:50 ` Jeff King
@ 2007-11-09 4:52 ` Jeff King
2007-11-09 23:20 ` René Scharfe
2007-11-09 23:39 ` Paul Mackerras
3 siblings, 1 reply; 24+ messages in thread
From: Jeff King @ 2007-11-09 4:52 UTC (permalink / raw)
To: René Scharfe
Cc: Junio C Hamano, Paul Mackerras, Git Mailing List, Pierre Habouzit,
Johannes Schindelin
On Fri, Nov 09, 2007 at 01:49:42AM +0100, René Scharfe wrote:
> + strbuf_expand(sb, format, placeholders, format_commit_item, (void *)commit);
This void cast is pointless, since all pointers types convert implicitly
to void pointers anyway. At best, it does nothing, and at worst, it
hides an actual type error if the function signature or the type of
'commit' change.
(In the patch I just sent out, I had to change this line anyway, and
removed the cast).
-Peff
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 2/2] --pretty=format: on-demand format expansion
2007-11-09 1:24 ` Johannes Schindelin
@ 2007-11-09 21:13 ` René Scharfe
2007-11-09 22:18 ` Johannes Schindelin
0 siblings, 1 reply; 24+ messages in thread
From: René Scharfe @ 2007-11-09 21:13 UTC (permalink / raw)
To: Johannes Schindelin
Cc: Junio C Hamano, Paul Mackerras, Git Mailing List, Pierre Habouzit
Johannes Schindelin schrieb:
> Hi,
>
> On Fri, 9 Nov 2007, Ren? Scharfe wrote:
>
>> strbuf.c | 24 ++++++
>> strbuf.h | 3 +
>> pretty.c | 276 ++++++++++++++++++++++++++++++++++----------------------------
>
> I would be so grateful if you could (trivially) split up this patch into
> the addition of strbuf_expend() (with a small example in the commit
> message), and a patch that uses it in pretty.c.
Makes sense. Will do next time.
Thanks,
René
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 2/2] --pretty=format: on-demand format expansion
2007-11-09 21:13 ` René Scharfe
@ 2007-11-09 22:18 ` Johannes Schindelin
2007-11-09 22:30 ` Junio C Hamano
0 siblings, 1 reply; 24+ messages in thread
From: Johannes Schindelin @ 2007-11-09 22:18 UTC (permalink / raw)
To: René Scharfe
Cc: Junio C Hamano, Paul Mackerras, Git Mailing List, Pierre Habouzit
Hi,
On Fri, 9 Nov 2007, Ren? Scharfe wrote:
> Johannes Schindelin schrieb:
> > Hi,
> >
> > On Fri, 9 Nov 2007, Ren? Scharfe wrote:
> >
> >> strbuf.c | 24 ++++++
> >> strbuf.h | 3 +
> >> pretty.c | 276 ++++++++++++++++++++++++++++++++++----------------------------
> >
> > I would be so grateful if you could (trivially) split up this patch into
> > the addition of strbuf_expend() (with a small example in the commit
> > message), and a patch that uses it in pretty.c.
>
> Makes sense. Will do next time.
You mean next time you write strbuf_expand()?
;-) I saw that Junio already applied your patch as is. Hmm.
Ciao,
Dscho
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 2/2] --pretty=format: on-demand format expansion
2007-11-09 22:18 ` Johannes Schindelin
@ 2007-11-09 22:30 ` Junio C Hamano
0 siblings, 0 replies; 24+ messages in thread
From: Junio C Hamano @ 2007-11-09 22:30 UTC (permalink / raw)
To: Johannes Schindelin
Cc: René Scharfe, Paul Mackerras, Git Mailing List,
Pierre Habouzit
Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:
> You mean next time you write strbuf_expand()?
>
> ;-) I saw that Junio already applied your patch as is. Hmm.
Which only means we expect incremental improvements on top of it
from now on, it does not mean "it is perfect and cast in stone".
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 2/2] --pretty=format: on-demand format expansion
2007-11-09 4:50 ` Jeff King
@ 2007-11-09 23:16 ` René Scharfe
2007-11-10 0:31 ` Johannes Schindelin
2007-11-10 0:46 ` Jeff King
0 siblings, 2 replies; 24+ messages in thread
From: René Scharfe @ 2007-11-09 23:16 UTC (permalink / raw)
To: Jeff King
Cc: Junio C Hamano, Paul Mackerras, Git Mailing List, Pierre Habouzit,
Johannes Schindelin
Jeff King schrieb:
> - formatting expensive items multiple times will incur work
> proportional to the number of times the item is used (in the old
> code, it was calculated just once). e.g., "%h%h%h%h"
>
> - formatting some items goes to some work that can be re-used by
> other items (e.g., %ad and %ar both need to parse the author date)
>
> And we could obviously overcome both by caching the results of
> expensive operations. I'm not sure if these will be a problem in
> practice. For the first one, the new code is so much faster that I
> needed to do
>
> git-log --pretty=format:%h%h%h%h%h%h%h%h
>
> to get a performance regression from the old code, which seems rather
> unlikely. For the second, it is easy to imagine multiple "person"
> items being used together, although their cost to produce is not all
> that high. It looks like about .05 seconds to parse a date (over all
> commits in git.git):
>
> $ time ./git-log --pretty='format:' >/dev/null real 0m0.441s user
> 0m0.424s sys 0m0.004s
>
> $ time ./git-log --pretty='format:%ad' >/dev/null real 0m0.477s
> user 0m0.472s sys 0m0.000s
>
> $ time ./git-log --pretty='format:%ad %aD' >/dev/null real
> 0m0.527s user 0m0.520s sys 0m0.004s
>
> where the last two could probably end up costing about the same if we
> cached the author parsing (but the caching will have a cost, too, so
> it might not end up being a big win).
>
> So it might make sense to cache some items as we figure them out.
> This should be done by the calling code and not by strbuf_expand
> (since it doesn't know which things are worth caching (and for fast
> things, allocating memory for a cache entry is likely to be slower),
> or how the expansions relate to each other).
I fully agree with the above.
> A partial patch on top of yours is below (it caches commit and tree
> abbreviations; parent abbreviations and person-parsing are probably
> worth doing). Some timings:
... but I object to the choice of items to cache. Are there real-world
formats containing the same placeholder twice or even more often?
There is probably more to gain from the interdependencies of different
placeholders. The patch below attempts to avoid parsing the commit
twice, by saving pointers to the different parts.
(next)
$ time git log --pretty=format:"* %cd %cn%n%n%s%n%b" >/dev/null
real 0m0.631s
user 0m0.584s
sys 0m0.040s
(next+patch)
$ time git log --pretty=format:"* %cd %cn%n%n%s%n%b" >/dev/null
real 0m0.570s
user 0m0.512s
sys 0m0.044s
The format is from http://svn.tue.mpg.de/tentakel/trunk/tentakel/Makefile
which I found while googling for real-world use cases.
While the code size is increased by the patch, most of the code gets
simplified.
pretty.c | 128 ++++++++++++++++++++++++++++++++++++++++++---------------------
1 file changed, 86 insertions(+), 42 deletions(-)
diff --git a/pretty.c b/pretty.c
index 9fbd73f..b2f10a3 100644
--- a/pretty.c
+++ b/pretty.c
@@ -354,14 +354,69 @@ static void format_person_part(struct strbuf *sb, char part,
}
}
+struct parsed_commit_header {
+ int parsed;
+ const char *subject_start;
+ const char *author_start;
+ const char *committer_start;
+ const char *encoding_start;
+ const char *body_start;
+ size_t subject_len;
+ size_t author_len;
+ size_t committer_len;
+ size_t encoding_len;
+};
+
+struct format_commit_context {
+ const struct commit *commit;
+ struct parsed_commit_header *parsed_commit_header;
+};
+
+static void parse_commit_header(struct parsed_commit_header *pch,
+ const struct commit *commit)
+{
+ const char *msg = commit->buffer;
+ int i;
+ enum { HEADER, SUBJECT, BODY } state;
+
+ for (i = 0, state = HEADER; msg[i] && state < BODY; i++) {
+ int eol;
+ for (eol = i; msg[eol] && msg[eol] != '\n'; eol++)
+ ; /* do nothing */
+
+ if (state == SUBJECT) {
+ pch->subject_start = msg + i;
+ pch->subject_len = eol - i;
+ i = eol;
+ }
+ if (i == eol) {
+ state++;
+ /* strip empty lines */
+ while (msg[eol + 1] == '\n')
+ eol++;
+ } else if (!prefixcmp(msg + i, "author ")) {
+ pch->author_start = msg + i + 7;
+ pch->author_len = eol - i - 7;
+ } else if (!prefixcmp(msg + i, "committer ")) {
+ pch->committer_start = msg + i + 10;
+ pch->committer_len = eol - i - 10;
+ } else if (!prefixcmp(msg + i, "encoding ")) {
+ pch->encoding_start = msg + i + 9;
+ pch->encoding_len = eol - i - 9;
+ }
+ i = eol;
+ }
+ pch->body_start = msg + i;
+ pch->parsed = 1;
+}
+
static void format_commit_item(struct strbuf *sb, const char *placeholder,
void *context)
{
- const struct commit *commit = context;
+ struct format_commit_context *c = context;
+ const struct commit *commit = c->commit;
+ struct parsed_commit_header *pch = c->parsed_commit_header;
struct commit_list *p;
- int i;
- enum { HEADER, SUBJECT, BODY } state;
- const char *msg = commit->buffer;
/* these are independent of the commit */
switch (placeholder[0]) {
@@ -429,45 +484,28 @@ static void format_commit_item(struct strbuf *sb, const char *placeholder,
}
/* For the rest we have to parse the commit header. */
- for (i = 0, state = HEADER; msg[i] && state < BODY; i++) {
- int eol;
- for (eol = i; msg[eol] && msg[eol] != '\n'; eol++)
- ; /* do nothing */
+ if (!pch->parsed)
+ parse_commit_header(pch, commit);
- if (state == SUBJECT) {
- if (placeholder[0] == 's') {
- strbuf_add(sb, msg + i, eol - i);
- return;
- }
- i = eol;
- }
- if (i == eol) {
- state++;
- /* strip empty lines */
- while (msg[eol + 1] == '\n')
- eol++;
- } else if (!prefixcmp(msg + i, "author ")) {
- if (placeholder[0] == 'a') {
- format_person_part(sb, placeholder[1],
- msg + i + 7, eol - i - 7);
- return;
- }
- } else if (!prefixcmp(msg + i, "committer ")) {
- if (placeholder[0] == 'c') {
- format_person_part(sb, placeholder[1],
- msg + i + 10, eol - i - 10);
- return;
- }
- } else if (!prefixcmp(msg + i, "encoding ")) {
- if (placeholder[0] == 'e') {
- strbuf_add(sb, msg + i + 9, eol - i - 9);
- return;
- }
- }
- i = eol;
+ switch (placeholder[0]) {
+ case 's':
+ strbuf_add(sb, pch->subject_start, pch->subject_len);
+ return;
+ case 'a':
+ format_person_part(sb, placeholder[1],
+ pch->author_start, pch->author_len);
+ return;
+ case 'c':
+ format_person_part(sb, placeholder[1],
+ pch->committer_start, pch->committer_len);
+ return;
+ case 'e':
+ strbuf_add(sb, pch->encoding_start, pch->encoding_len);
+ return;
+ case 'b':
+ strbuf_addstr(sb, pch->body_start);
+ return;
}
- if (msg[i] && placeholder[0] == 'b') /* body */
- strbuf_addstr(sb, msg + i);
}
void format_commit_message(const struct commit *commit,
@@ -505,7 +543,13 @@ void format_commit_message(const struct commit *commit,
"m", /* left/right/bottom */
NULL
};
- strbuf_expand(sb, format, placeholders, format_commit_item, (void *)commit);
+ struct parsed_commit_header pch;
+ struct format_commit_context context;
+
+ memset(&pch, 0, sizeof(pch));
+ context.commit = commit;
+ context.parsed_commit_header = &pch;
+ strbuf_expand(sb, format, placeholders, format_commit_item, &context);
}
static void pp_header(enum cmit_fmt fmt,
^ permalink raw reply related [flat|nested] 24+ messages in thread
* Re: [PATCH 2/2] --pretty=format: on-demand format expansion
2007-11-09 4:52 ` [PATCH 2/2] --pretty=format: on-demand format expansion Jeff King
@ 2007-11-09 23:20 ` René Scharfe
2007-11-10 0:51 ` Jeff King
0 siblings, 1 reply; 24+ messages in thread
From: René Scharfe @ 2007-11-09 23:20 UTC (permalink / raw)
To: Jeff King
Cc: Junio C Hamano, Paul Mackerras, Git Mailing List, Pierre Habouzit,
Johannes Schindelin
Jeff King schrieb:
> On Fri, Nov 09, 2007 at 01:49:42AM +0100, René Scharfe wrote:
>
>> + strbuf_expand(sb, format, placeholders, format_commit_item, (void *)commit);
>
> This void cast is pointless, since all pointers types convert implicitly
> to void pointers anyway. At best, it does nothing, and at worst, it
> hides an actual type error if the function signature or the type of
> 'commit' change.
When commit (of type const struct commit*) is implicitly converted to
void *, gcc complains because the "const" qualifier is silently dropped.
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 2/2] --pretty=format: on-demand format expansion
2007-11-09 0:49 [PATCH 2/2] --pretty=format: on-demand format expansion René Scharfe
` (2 preceding siblings ...)
2007-11-09 4:52 ` [PATCH 2/2] --pretty=format: on-demand format expansion Jeff King
@ 2007-11-09 23:39 ` Paul Mackerras
3 siblings, 0 replies; 24+ messages in thread
From: Paul Mackerras @ 2007-11-09 23:39 UTC (permalink / raw)
To: René Scharfe
Cc: Junio C Hamano, Git Mailing List, Pierre Habouzit,
Johannes Schindelin
Thanks for doing this.
Could I ask for a couple more things? I would find it useful to have
format codes for the number of characters in the commit body. Having
a code for the number of bytes in the commit body would also be
useful. When reading commits in with Tcl, the number of characters is
more useful, but I can use the number of bytes (and I imagine others
would find that useful).
Paul.
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 2/2] --pretty=format: on-demand format expansion
2007-11-09 23:16 ` René Scharfe
@ 2007-11-10 0:31 ` Johannes Schindelin
2007-11-10 0:49 ` Jeff King
2007-11-10 0:46 ` Jeff King
1 sibling, 1 reply; 24+ messages in thread
From: Johannes Schindelin @ 2007-11-10 0:31 UTC (permalink / raw)
To: René Scharfe
Cc: Jeff King, Junio C Hamano, Paul Mackerras, Git Mailing List,
Pierre Habouzit
Hi,
On Sat, 10 Nov 2007, Ren? Scharfe wrote:
> (next)
> $ time git log --pretty=format:"* %cd %cn%n%n%s%n%b" >/dev/null
>
> real 0m0.631s
> user 0m0.584s
> sys 0m0.040s
>
> (next+patch)
> $ time git log --pretty=format:"* %cd %cn%n%n%s%n%b" >/dev/null
>
> real 0m0.570s
> user 0m0.512s
> sys 0m0.044s
Wow.
If you keep going like that, "git log" will be slower than "git log
--pretty=format:bla" soon.
Ciao,
Dscho
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 2/2] --pretty=format: on-demand format expansion
2007-11-09 23:16 ` René Scharfe
2007-11-10 0:31 ` Johannes Schindelin
@ 2007-11-10 0:46 ` Jeff King
2007-11-10 11:12 ` René Scharfe
` (3 more replies)
1 sibling, 4 replies; 24+ messages in thread
From: Jeff King @ 2007-11-10 0:46 UTC (permalink / raw)
To: René Scharfe
Cc: Junio C Hamano, Paul Mackerras, Git Mailing List, Pierre Habouzit,
Johannes Schindelin
On Sat, Nov 10, 2007 at 12:16:30AM +0100, René Scharfe wrote:
> > A partial patch on top of yours is below (it caches commit and tree
> > abbreviations; parent abbreviations and person-parsing are probably
> > worth doing). Some timings:
>
> ... but I object to the choice of items to cache. Are there real-world
> formats containing the same placeholder twice or even more often?
My choice of items was more "here is what I am talking about" and not
"this is the best set of items."
As for what real-world workloads are like, part of the _point_ of
--pretty=format: is for one-off formats that users use in their
workflow. So yes, I have used formats that repeat specifiers, but they
are probably not common.
The point of my timings was to show not only that we sped up that
uncommon case, but that there was negligible cost to the common case.
And since we don't know what formats users will provide, it makes sense
not to have lousy performance on the uncommon.
> There is probably more to gain from the interdependencies of different
> placeholders. The patch below attempts to avoid parsing the commit
> twice, by saving pointers to the different parts.
Looks sane, although I don't see any reason this couldn't just go on top
of my patch, and then we can speed up both cases.
> (next)
> $ time git log --pretty=format:"* %cd %cn%n%n%s%n%b" >/dev/null
> real 0m0.631s
[...]
> (next+patch)
> $ time git log --pretty=format:"* %cd %cn%n%n%s%n%b" >/dev/null
> real 0m0.570s
Great, you have sped up the uncommon case. But what is the cost to "git
log --pretty=format:" and "git log --pretty=format:%cd"?
-Peff
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 2/2] --pretty=format: on-demand format expansion
2007-11-10 0:31 ` Johannes Schindelin
@ 2007-11-10 0:49 ` Jeff King
0 siblings, 0 replies; 24+ messages in thread
From: Jeff King @ 2007-11-10 0:49 UTC (permalink / raw)
To: Johannes Schindelin
Cc: René Scharfe, Junio C Hamano, Paul Mackerras,
Git Mailing List, Pierre Habouzit
On Sat, Nov 10, 2007 at 12:31:41AM +0000, Johannes Schindelin wrote:
> If you keep going like that, "git log" will be slower than "git log
> --pretty=format:bla" soon.
Maybe I am dreaming, but the code might be much simpler and more
readable if --pretty=* were implemented in terms of --pretty=format,
assuming the speeds are equivalent.
-Peff
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 2/2] --pretty=format: on-demand format expansion
2007-11-09 23:20 ` René Scharfe
@ 2007-11-10 0:51 ` Jeff King
0 siblings, 0 replies; 24+ messages in thread
From: Jeff King @ 2007-11-10 0:51 UTC (permalink / raw)
To: René Scharfe
Cc: Junio C Hamano, Paul Mackerras, Git Mailing List, Pierre Habouzit,
Johannes Schindelin
On Sat, Nov 10, 2007 at 12:20:22AM +0100, René Scharfe wrote:
> > This void cast is pointless, since all pointers types convert implicitly
> > to void pointers anyway. At best, it does nothing, and at worst, it
> > hides an actual type error if the function signature or the type of
> > 'commit' change.
>
> When commit (of type const struct commit*) is implicitly converted to
> void *, gcc complains because the "const" qualifier is silently dropped.
Bleh. You're right, of course. I don't like casting away constness, but
it is stupid for strbuf_expand to take a "const void *" since it has no
idea how it will be used (although I note your caching patch also neatly
does away with this, anyway).
-Peff
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 2/2] --pretty=format: on-demand format expansion
2007-11-10 0:46 ` Jeff King
@ 2007-11-10 11:12 ` René Scharfe
2007-11-10 16:07 ` Johannes Schindelin
2007-11-10 20:34 ` Jeff King
2007-11-10 11:14 ` [PATCH 1/3] --pretty=format: parse commit message only once René Scharfe
` (2 subsequent siblings)
3 siblings, 2 replies; 24+ messages in thread
From: René Scharfe @ 2007-11-10 11:12 UTC (permalink / raw)
To: Jeff King
Cc: Junio C Hamano, Paul Mackerras, Git Mailing List, Pierre Habouzit,
Johannes Schindelin
Jeff King schrieb:
> On Sat, Nov 10, 2007 at 12:16:30AM +0100, René Scharfe wrote:
>
>>> A partial patch on top of yours is below (it caches commit and tree
>>> abbreviations; parent abbreviations and person-parsing are probably
>>> worth doing). Some timings:
>> ... but I object to the choice of items to cache. Are there real-world
>> formats containing the same placeholder twice or even more often?
>
> My choice of items was more "here is what I am talking about" and not
> "this is the best set of items."
>
> As for what real-world workloads are like, part of the _point_ of
> --pretty=format: is for one-off formats that users use in their
> workflow. So yes, I have used formats that repeat specifiers, but they
> are probably not common.
Hmm, OK.
> The point of my timings was to show not only that we sped up that
> uncommon case, but that there was negligible cost to the common case.
> And since we don't know what formats users will provide, it makes sense
> not to have lousy performance on the uncommon.
My suspicion was that it's more like "never used" rather than
"uncommon". Since you're a duplicate placeholder user, it's not "never"
any more. :)
>> There is probably more to gain from the interdependencies of different
>> placeholders. The patch below attempts to avoid parsing the commit
>> twice, by saving pointers to the different parts.
>
> Looks sane, although I don't see any reason this couldn't just go on top
> of my patch, and then we can speed up both cases.
Yes, the two are independent. I don't like the malloc()'s in your
patch, though, and have cooked up a different one on top of a cleaned up
version of mine. It plays the dirty trick of reading expansions of
repeated placeholders from the strbuf..
René
^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH 1/3] --pretty=format: parse commit message only once
2007-11-10 0:46 ` Jeff King
2007-11-10 11:12 ` René Scharfe
@ 2007-11-10 11:14 ` René Scharfe
2007-11-10 11:16 ` [PATCH 2/3] add strbuf_adddup() René Scharfe
2007-11-10 11:18 ` [PATCH 3/3] --format=pretty: avoid calculating expensive expansions twice René Scharfe
3 siblings, 0 replies; 24+ messages in thread
From: René Scharfe @ 2007-11-10 11:14 UTC (permalink / raw)
To: Jeff King, Junio C Hamano
Cc: Paul Mackerras, Git Mailing List, Pierre Habouzit,
Johannes Schindelin
As Jeff King pointed out, some placeholder expansions are related to
each other: the steps to calculate one go most of the way towards
calculating the other, too.
This patch makes format_commit_message() parse the commit message
only once, remembering the position of each item. This speeds up
handling of format strings containing multiple placeholders from the
set %s, %a*, %c*, %e, %b.
Here are the timings for the git version in next. The first one is
to estimate the overhead of the caching, the second one is taken
from http://svn.tue.mpg.de/tentakel/trunk/tentakel/Makefile as an
example of a format string found in the wild. The times are the
fastest of three consecutive runs in each case:
$ time git log --pretty=format:%e >/dev/null
real 0m0.381s
user 0m0.340s
sys 0m0.024s
$ time git log --pretty=format:"* %cd %cn%n%n%s%n%b" >/dev/null
real 0m0.623s
user 0m0.556s
sys 0m0.052s
And here the times with this patch:
$ time git log --pretty=format:%e >/dev/null
real 0m0.385s
user 0m0.332s
sys 0m0.040s
$ time git log --pretty=format:"* %cd %cn%n%n%s%n%b" >/dev/null
real 0m0.563s
user 0m0.504s
sys 0m0.048s
Signed-off-by: Rene Scharfe <rene.scharfe@lsrfire.ath.cx>
---
pretty.c | 124 +++++++++++++++++++++++++++++++++++++++++---------------------
1 files changed, 82 insertions(+), 42 deletions(-)
diff --git a/pretty.c b/pretty.c
index 9fbd73f..0c2f83b 100644
--- a/pretty.c
+++ b/pretty.c
@@ -354,14 +354,67 @@ static void format_person_part(struct strbuf *sb, char part,
}
}
-static void format_commit_item(struct strbuf *sb, const char *placeholder,
- void *context)
+struct chunk {
+ size_t off;
+ size_t len;
+};
+
+struct format_commit_context {
+ const struct commit *commit;
+
+ /* These offsets are relative to the start of the commit message. */
+ int commit_header_parsed;
+ struct chunk subject;
+ struct chunk author;
+ struct chunk committer;
+ struct chunk encoding;
+ size_t body_off;
+};
+
+static void parse_commit_header(struct format_commit_context *context)
{
- const struct commit *commit = context;
- struct commit_list *p;
+ const char *msg = context->commit->buffer;
int i;
enum { HEADER, SUBJECT, BODY } state;
+
+ for (i = 0, state = HEADER; msg[i] && state < BODY; i++) {
+ int eol;
+ for (eol = i; msg[eol] && msg[eol] != '\n'; eol++)
+ ; /* do nothing */
+
+ if (state == SUBJECT) {
+ context->subject.off = i;
+ context->subject.len = eol - i;
+ i = eol;
+ }
+ if (i == eol) {
+ state++;
+ /* strip empty lines */
+ while (msg[eol + 1] == '\n')
+ eol++;
+ } else if (!prefixcmp(msg + i, "author ")) {
+ context->author.off = i + 7;
+ context->author.len = eol - i - 7;
+ } else if (!prefixcmp(msg + i, "committer ")) {
+ context->committer.off = i + 10;
+ context->committer.len = eol - i - 10;
+ } else if (!prefixcmp(msg + i, "encoding ")) {
+ context->encoding.off = i + 9;
+ context->encoding.len = eol - i - 9;
+ }
+ i = eol;
+ }
+ context->body_off = i;
+ context->commit_header_parsed = 1;
+}
+
+static void format_commit_item(struct strbuf *sb, const char *placeholder,
+ void *context)
+{
+ struct format_commit_context *c = context;
+ const struct commit *commit = c->commit;
const char *msg = commit->buffer;
+ struct commit_list *p;
/* these are independent of the commit */
switch (placeholder[0]) {
@@ -429,45 +482,28 @@ static void format_commit_item(struct strbuf *sb, const char *placeholder,
}
/* For the rest we have to parse the commit header. */
- for (i = 0, state = HEADER; msg[i] && state < BODY; i++) {
- int eol;
- for (eol = i; msg[eol] && msg[eol] != '\n'; eol++)
- ; /* do nothing */
+ if (!c->commit_header_parsed)
+ parse_commit_header(c);
- if (state == SUBJECT) {
- if (placeholder[0] == 's') {
- strbuf_add(sb, msg + i, eol - i);
- return;
- }
- i = eol;
- }
- if (i == eol) {
- state++;
- /* strip empty lines */
- while (msg[eol + 1] == '\n')
- eol++;
- } else if (!prefixcmp(msg + i, "author ")) {
- if (placeholder[0] == 'a') {
- format_person_part(sb, placeholder[1],
- msg + i + 7, eol - i - 7);
- return;
- }
- } else if (!prefixcmp(msg + i, "committer ")) {
- if (placeholder[0] == 'c') {
- format_person_part(sb, placeholder[1],
- msg + i + 10, eol - i - 10);
- return;
- }
- } else if (!prefixcmp(msg + i, "encoding ")) {
- if (placeholder[0] == 'e') {
- strbuf_add(sb, msg + i + 9, eol - i - 9);
- return;
- }
- }
- i = eol;
+ switch (placeholder[0]) {
+ case 's':
+ strbuf_add(sb, msg + c->subject.off, c->subject.len);
+ return;
+ case 'a':
+ format_person_part(sb, placeholder[1],
+ msg + c->author.off, c->author.len);
+ return;
+ case 'c':
+ format_person_part(sb, placeholder[1],
+ msg + c->committer.off, c->committer.len);
+ return;
+ case 'e':
+ strbuf_add(sb, msg + c->encoding.off, c->encoding.len);
+ return;
+ case 'b':
+ strbuf_addstr(sb, msg + c->body_off);
+ return;
}
- if (msg[i] && placeholder[0] == 'b') /* body */
- strbuf_addstr(sb, msg + i);
}
void format_commit_message(const struct commit *commit,
@@ -505,7 +541,11 @@ void format_commit_message(const struct commit *commit,
"m", /* left/right/bottom */
NULL
};
- strbuf_expand(sb, format, placeholders, format_commit_item, (void *)commit);
+ struct format_commit_context context;
+
+ memset(&context, 0, sizeof(context));
+ context.commit = commit;
+ strbuf_expand(sb, format, placeholders, format_commit_item, &context);
}
static void pp_header(enum cmit_fmt fmt,
--
1.5.3.5.1651.g30bf
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH 2/3] add strbuf_adddup()
2007-11-10 0:46 ` Jeff King
2007-11-10 11:12 ` René Scharfe
2007-11-10 11:14 ` [PATCH 1/3] --pretty=format: parse commit message only once René Scharfe
@ 2007-11-10 11:16 ` René Scharfe
2007-11-10 11:18 ` [PATCH 3/3] --format=pretty: avoid calculating expensive expansions twice René Scharfe
3 siblings, 0 replies; 24+ messages in thread
From: René Scharfe @ 2007-11-10 11:16 UTC (permalink / raw)
To: Jeff King, Junio C Hamano
Cc: Paul Mackerras, Git Mailing List, Pierre Habouzit,
Johannes Schindelin
Add a new function, strbuf_adddup(), that appends a duplicate of a
part of a struct strbuf to end of the latter.
Signed-off-by: Rene Scharfe <rene.scharfe@lsrfire.ath.cx>
---
strbuf.c | 7 +++++++
strbuf.h | 1 +
2 files changed, 8 insertions(+), 0 deletions(-)
diff --git a/strbuf.c b/strbuf.c
index 536b432..dbd8c4b 100644
--- a/strbuf.c
+++ b/strbuf.c
@@ -106,6 +106,13 @@ void strbuf_add(struct strbuf *sb, const void *data, size_t len)
strbuf_setlen(sb, sb->len + len);
}
+void strbuf_adddup(struct strbuf *sb, size_t pos, size_t len)
+{
+ strbuf_grow(sb, len);
+ memcpy(sb->buf + sb->len, sb->buf + pos, len);
+ strbuf_setlen(sb, sb->len + len);
+}
+
void strbuf_addf(struct strbuf *sb, const char *fmt, ...)
{
int len;
diff --git a/strbuf.h b/strbuf.h
index 21d8944..1391912 100644
--- a/strbuf.h
+++ b/strbuf.h
@@ -101,6 +101,7 @@ static inline void strbuf_addstr(struct strbuf *sb, const char *s) {
static inline void strbuf_addbuf(struct strbuf *sb, struct strbuf *sb2) {
strbuf_add(sb, sb2->buf, sb2->len);
}
+extern void strbuf_adddup(struct strbuf *sb, size_t pos, size_t len);
typedef void (*expand_fn_t) (struct strbuf *sb, const char *placeholder, void *context);
extern void strbuf_expand(struct strbuf *sb, const char *format, const char **placeholders, expand_fn_t fn, void *context);
--
1.5.3.5.1651.g30bf
^ permalink raw reply related [flat|nested] 24+ messages in thread
* [PATCH 3/3] --format=pretty: avoid calculating expensive expansions twice
2007-11-10 0:46 ` Jeff King
` (2 preceding siblings ...)
2007-11-10 11:16 ` [PATCH 2/3] add strbuf_adddup() René Scharfe
@ 2007-11-10 11:18 ` René Scharfe
2007-11-11 10:29 ` Junio C Hamano
3 siblings, 1 reply; 24+ messages in thread
From: René Scharfe @ 2007-11-10 11:18 UTC (permalink / raw)
To: Jeff King, Junio C Hamano
Cc: Paul Mackerras, Git Mailing List, Pierre Habouzit,
Johannes Schindelin
As Jeff King remarked, format strings with duplicate placeholders can
be slow to expand, because each instance is calculated anew.
This patch makes use of the fact that format_commit_message() and its
helper functions only ever add stuff to the end of the strbuf. For
certain expensive placeholders, store the offset and length of their
expansion with the strbuf at the first occurrence. Later they
expansion result can simply be copied from there -- no malloc() or
strdup() required.
These certain placeholders are the abbreviated commit, tree and
parent hashes, as the search for a unique abbreviated hash is quite
costly. Here are the times for next (best of three runs):
$ time git log --pretty=format:%h >/dev/null
real 0m0.611s
user 0m0.404s
sys 0m0.204s
$ time git log --pretty=format:%h%h%h%h >/dev/null
real 0m1.206s
user 0m0.744s
sys 0m0.452s
And here those with this patch (and the previous two); the speedup
of the single placeholder case is just noise:
$ time git log --pretty=format:%h >/dev/null
real 0m0.608s
user 0m0.416s
sys 0m0.192s
$ time git log --pretty=format:%h%h%h%h >/dev/null
real 0m0.639s
user 0m0.488s
sys 0m0.140s
Signed-off-by: Rene Scharfe <rene.scharfe@lsrfire.ath.cx>
---
pretty.c | 32 ++++++++++++++++++++++++++++++++
1 files changed, 32 insertions(+), 0 deletions(-)
diff --git a/pretty.c b/pretty.c
index 0c2f83b..ab142b8 100644
--- a/pretty.c
+++ b/pretty.c
@@ -369,8 +369,30 @@ struct format_commit_context {
struct chunk committer;
struct chunk encoding;
size_t body_off;
+
+ /* The following ones are relative to the result struct strbuf. */
+ struct chunk abbrev_commit_hash;
+ struct chunk abbrev_tree_hash;
+ struct chunk abbrev_parent_hashes;
};
+static int add_again(struct strbuf *sb, struct chunk *chunk)
+{
+ if (chunk->len) {
+ strbuf_adddup(sb, chunk->off, chunk->len);
+ return 1;
+ }
+
+ /*
+ * We haven't seen this chunk before. Our caller is surely
+ * going to add it the hard way now. Remember the most likely
+ * start of the to-be-added chunk: the current end of the
+ * struct strbuf.
+ */
+ chunk->off = sb->len;
+ return 0;
+}
+
static void parse_commit_header(struct format_commit_context *context)
{
const char *msg = context->commit->buffer;
@@ -447,15 +469,21 @@ static void format_commit_item(struct strbuf *sb, const char *placeholder,
strbuf_addstr(sb, sha1_to_hex(commit->object.sha1));
return;
case 'h': /* abbreviated commit hash */
+ if (add_again(sb, &c->abbrev_commit_hash))
+ return;
strbuf_addstr(sb, find_unique_abbrev(commit->object.sha1,
DEFAULT_ABBREV));
+ c->abbrev_commit_hash.len = sb->len - c->abbrev_commit_hash.off;
return;
case 'T': /* tree hash */
strbuf_addstr(sb, sha1_to_hex(commit->tree->object.sha1));
return;
case 't': /* abbreviated tree hash */
+ if (add_again(sb, &c->abbrev_tree_hash))
+ return;
strbuf_addstr(sb, find_unique_abbrev(commit->tree->object.sha1,
DEFAULT_ABBREV));
+ c->abbrev_tree_hash.len = sb->len - c->abbrev_tree_hash.off;
return;
case 'P': /* parent hashes */
for (p = commit->parents; p; p = p->next) {
@@ -465,12 +493,16 @@ static void format_commit_item(struct strbuf *sb, const char *placeholder,
}
return;
case 'p': /* abbreviated parent hashes */
+ if (add_again(sb, &c->abbrev_parent_hashes))
+ return;
for (p = commit->parents; p; p = p->next) {
if (p != commit->parents)
strbuf_addch(sb, ' ');
strbuf_addstr(sb, find_unique_abbrev(
p->item->object.sha1, DEFAULT_ABBREV));
}
+ c->abbrev_parent_hashes.len = sb->len -
+ c->abbrev_parent_hashes.off;
return;
case 'm': /* left/right/bottom */
strbuf_addch(sb, (commit->object.flags & BOUNDARY)
--
1.5.3.5.1651.g30bf
^ permalink raw reply related [flat|nested] 24+ messages in thread
* Re: [PATCH 2/2] --pretty=format: on-demand format expansion
2007-11-10 11:12 ` René Scharfe
@ 2007-11-10 16:07 ` Johannes Schindelin
2007-11-10 16:24 ` René Scharfe
2007-11-10 20:36 ` Jeff King
2007-11-10 20:34 ` Jeff King
1 sibling, 2 replies; 24+ messages in thread
From: Johannes Schindelin @ 2007-11-10 16:07 UTC (permalink / raw)
To: René Scharfe
Cc: Jeff King, Junio C Hamano, Paul Mackerras, Git Mailing List,
Pierre Habouzit
Hi,
On Sat, 10 Nov 2007, Ren? Scharfe wrote:
> [...] have cooked up a different one on top of a cleaned up version of
> mine. It plays the dirty trick of reading expansions of repeated
> placeholders from the strbuf..
... which would not work (likely even segfault) if you work with the same
private data on different strbufs.
But I guess it will not matter much in practice.
Ciao,
Dscho
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 2/2] --pretty=format: on-demand format expansion
2007-11-10 16:07 ` Johannes Schindelin
@ 2007-11-10 16:24 ` René Scharfe
2007-11-10 20:36 ` Jeff King
1 sibling, 0 replies; 24+ messages in thread
From: René Scharfe @ 2007-11-10 16:24 UTC (permalink / raw)
To: Johannes Schindelin
Cc: Jeff King, Junio C Hamano, Paul Mackerras, Git Mailing List,
Pierre Habouzit
Johannes Schindelin schrieb:
> Hi,
>
> On Sat, 10 Nov 2007, René Scharfe wrote:
>
>> [...] have cooked up a different one on top of a cleaned up version of
>> mine. It plays the dirty trick of reading expansions of repeated
>> placeholders from the strbuf..
>
> ... which would not work (likely even segfault) if you work with the same
> private data on different strbufs.
>
> But I guess it will not matter much in practice.
Only a single strbuf is used, and the function that copies the data
around, strbuf_adddup(), operates on a single strbuf, only. Copying
data between two strbufs using strbuf_add() etc. would be safe.
What one should *not* do is this:
strbuf_add(sb, sb->buf + offset, length);
This leads to problems when the buffer is realloc()ated by strbuf_add().
What other things can go wrong? A segfault would definitely matter..
Thanks,
René
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 2/2] --pretty=format: on-demand format expansion
2007-11-10 11:12 ` René Scharfe
2007-11-10 16:07 ` Johannes Schindelin
@ 2007-11-10 20:34 ` Jeff King
2007-11-11 8:13 ` Jeff King
1 sibling, 1 reply; 24+ messages in thread
From: Jeff King @ 2007-11-10 20:34 UTC (permalink / raw)
To: René Scharfe
Cc: Junio C Hamano, Paul Mackerras, Git Mailing List, Pierre Habouzit,
Johannes Schindelin
On Sat, Nov 10, 2007 at 12:12:33PM +0100, René Scharfe wrote:
> Yes, the two are independent. I don't like the malloc()'s in your
> patch, though, and have cooked up a different one on top of a cleaned up
> version of mine. It plays the dirty trick of reading expansions of
> repeated placeholders from the strbuf..
Yes, the mallocs would obviously not be necessary for the commit
parsing, since we already have a buffer, but I assumed they would be for
the abbreviations.
Your solution leaves me partly disgusted at the hackishness, and partly
delighted at the cleverness. I think the way you have coded it is quite
clear, though, so I don't think anyone is likely to get confused reading
it. I haven't read through it carefully yet, but a tentative Ack from
me.
-Peff
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 2/2] --pretty=format: on-demand format expansion
2007-11-10 16:07 ` Johannes Schindelin
2007-11-10 16:24 ` René Scharfe
@ 2007-11-10 20:36 ` Jeff King
1 sibling, 0 replies; 24+ messages in thread
From: Jeff King @ 2007-11-10 20:36 UTC (permalink / raw)
To: Johannes Schindelin
Cc: René Scharfe, Junio C Hamano, Paul Mackerras,
Git Mailing List, Pierre Habouzit
On Sat, Nov 10, 2007 at 04:07:10PM +0000, Johannes Schindelin wrote:
> > [...] have cooked up a different one on top of a cleaned up version of
> > mine. It plays the dirty trick of reading expansions of repeated
> > placeholders from the strbuf..
>
> ... which would not work (likely even segfault) if you work with the same
> private data on different strbufs.
The nice thing is that the cache validity only lasts for the call to
strbuf_expand (and its callbacks), so there is really very little chance
of somebody mis-using this.
-Peff
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 2/2] --pretty=format: on-demand format expansion
2007-11-10 20:34 ` Jeff King
@ 2007-11-11 8:13 ` Jeff King
0 siblings, 0 replies; 24+ messages in thread
From: Jeff King @ 2007-11-11 8:13 UTC (permalink / raw)
To: René Scharfe
Cc: Junio C Hamano, Paul Mackerras, Git Mailing List, Pierre Habouzit,
Johannes Schindelin
On Sat, Nov 10, 2007 at 03:34:35PM -0500, Jeff King wrote:
> Your solution leaves me partly disgusted at the hackishness, and partly
> delighted at the cleverness. I think the way you have coded it is quite
> clear, though, so I don't think anyone is likely to get confused reading
> it. I haven't read through it carefully yet, but a tentative Ack from
> me.
OK, I had a chance to read through your patches more carefully. Assuming
that the strbuf offset technique is acceptable (and I think it is worth
the increased speed and simplicity), they look great to me. So:
Acked-by: Jeff King <peff@peff.net>
-Peff
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH 3/3] --format=pretty: avoid calculating expensive expansions twice
2007-11-10 11:18 ` [PATCH 3/3] --format=pretty: avoid calculating expensive expansions twice René Scharfe
@ 2007-11-11 10:29 ` Junio C Hamano
0 siblings, 0 replies; 24+ messages in thread
From: Junio C Hamano @ 2007-11-11 10:29 UTC (permalink / raw)
To: René Scharfe
Cc: Jeff King, Paul Mackerras, Git Mailing List, Pierre Habouzit,
Johannes Schindelin
René Scharfe <rene.scharfe@lsrfire.ath.cx> writes:
> +static int add_again(struct strbuf *sb, struct chunk *chunk)
> +{
> + if (chunk->len) {
> + strbuf_adddup(sb, chunk->off, chunk->len);
> + return 1;
> + }
> +
> + /*
> + * We haven't seen this chunk before. Our caller is surely
> + * going to add it the hard way now. Remember the most likely
> + * start of the to-be-added chunk: the current end of the
> + * struct strbuf.
> + */
> + chunk->off = sb->len;
> + return 0;
> +}
> +
> static void parse_commit_header(struct format_commit_context *context)
> {
> const char *msg = context->commit->buffer;
> @@ -447,15 +469,21 @@ static void format_commit_item(struct strbuf *sb, const char *placeholder,
> strbuf_addstr(sb, sha1_to_hex(commit->object.sha1));
> return;
> case 'h': /* abbreviated commit hash */
> + if (add_again(sb, &c->abbrev_commit_hash))
> + return;
> strbuf_addstr(sb, find_unique_abbrev(commit->object.sha1,
> DEFAULT_ABBREV));
> + c->abbrev_commit_hash.len = sb->len - c->abbrev_commit_hash.off;
> return;
Brilliant. Doubly brilliant is the adddup abstraction that does
not suffer from underlying strbuf being reallocated.
Me likee..
^ permalink raw reply [flat|nested] 24+ messages in thread
end of thread, other threads:[~2007-11-11 10:29 UTC | newest]
Thread overview: 24+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-11-09 0:49 [PATCH 2/2] --pretty=format: on-demand format expansion René Scharfe
2007-11-09 1:24 ` Johannes Schindelin
2007-11-09 21:13 ` René Scharfe
2007-11-09 22:18 ` Johannes Schindelin
2007-11-09 22:30 ` Junio C Hamano
2007-11-09 4:50 ` Jeff King
2007-11-09 23:16 ` René Scharfe
2007-11-10 0:31 ` Johannes Schindelin
2007-11-10 0:49 ` Jeff King
2007-11-10 0:46 ` Jeff King
2007-11-10 11:12 ` René Scharfe
2007-11-10 16:07 ` Johannes Schindelin
2007-11-10 16:24 ` René Scharfe
2007-11-10 20:36 ` Jeff King
2007-11-10 20:34 ` Jeff King
2007-11-11 8:13 ` Jeff King
2007-11-10 11:14 ` [PATCH 1/3] --pretty=format: parse commit message only once René Scharfe
2007-11-10 11:16 ` [PATCH 2/3] add strbuf_adddup() René Scharfe
2007-11-10 11:18 ` [PATCH 3/3] --format=pretty: avoid calculating expensive expansions twice René Scharfe
2007-11-11 10:29 ` Junio C Hamano
2007-11-09 4:52 ` [PATCH 2/2] --pretty=format: on-demand format expansion Jeff King
2007-11-09 23:20 ` René Scharfe
2007-11-10 0:51 ` Jeff King
2007-11-09 23:39 ` Paul Mackerras
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).