[PATCH] git-blame: Make the output human readable

git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH] git-blame: Make the output human readable
@ 2006-03-05 11:03 Fredrik Kuivinen
  2006-03-05 12:10 ` Junio C Hamano
  0 siblings, 1 reply; 12+ messages in thread
From: Fredrik Kuivinen @ 2006-03-05 11:03 UTC (permalink / raw)
  To: git; +Cc: junkio

The default output mode is slightly different from git-annotate's.
However, git-annotate's output mode can be obtained by using the
'-c' flag.

Signed-off-by: Fredrik Kuivinen <freku045@student.liu.se>


---

 Makefile |    4 ++
 blame.c  |  164 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 149 insertions(+), 19 deletions(-)

3459b7c5032d1462d5857cf6afc2d3e3ef61b93b
diff --git a/Makefile b/Makefile
index b6d8804..eb1887d 100644
--- a/Makefile
+++ b/Makefile
@@ -534,6 +534,10 @@ git-rev-list$X: rev-list.o $(LIB_FILE)
 	$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) \
 		$(LIBS) $(OPENSSL_LIBSSL)
 
+git-blame$X: blame.o $(LIB_FILE)
+	$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) \
+		$(LIBS) -lm
+
 init-db.o: init-db.c
 	$(CC) -c $(ALL_CFLAGS) \
 		-DDEFAULT_GIT_TEMPLATE_DIR='"$(template_dir_SQ)"' $*.c
diff --git a/blame.c b/blame.c
index 7308c36..6dccae5 100644
--- a/blame.c
+++ b/blame.c
@@ -5,6 +5,7 @@
 #include <assert.h>
 #include <time.h>
 #include <sys/time.h>
+#include <math.h>
 
 #include "cache.h"
 #include "refs.h"
@@ -17,8 +18,15 @@
 
 #define DEBUG 0
 
-struct commit **blame_lines;
-int num_blame_lines;
+static const char blame_usage[] = "[-c] [-l] [--] file [commit]\n"
+	"  -c, --compability Use the same output mode as git-annotate (Default: off)\n"
+	"  -l, --long        Show long commit SHA1 (Default: off)\n"
+	"  -h, --help        This message";
+
+static struct commit **blame_lines;
+static int num_blame_lines;
+static char* blame_contents;
+static int blame_len;
 
 struct util_info {
 	int *line_map;
@@ -388,9 +396,8 @@ static void init_first_commit(struct com
 	alloc_line_map(commit);
 
 	util = commit->object.util;
-	num_blame_lines = util->num_lines;
 
-	for (i = 0; i < num_blame_lines; i++)
+	for (i = 0; i < util->num_lines; i++)
 		util->line_map[i] = i;
 }
 
@@ -412,6 +419,9 @@ static void process_commits(struct rev_i
 	util = commit->object.util;
 	num_blame_lines = util->num_lines;
 	blame_lines = xmalloc(sizeof(struct commit *) * num_blame_lines);
+	blame_contents = util->buf;
+	blame_len = util->size;
+
 	for (i = 0; i < num_blame_lines; i++)
 		blame_lines[i] = NULL;
 
@@ -499,32 +509,128 @@ static void process_commits(struct rev_i
 	} while ((commit = get_revision(rev)) != NULL);
 }
 
+struct commit_info
+{
+	char* author;
+	char* author_mail;
+	unsigned long author_time;
+	char* author_tz;
+};
+
+static void get_commit_info(struct commit* commit, struct commit_info* ret)
+{
+	int len;
+	char* tmp;
+	static char author_buf[1024];
+
+	tmp = strstr(commit->buffer, "\nauthor ") + 8;
+	len = index(tmp, '\n') - tmp;
+	ret->author = author_buf;
+	memcpy(ret->author, tmp, len);
+
+	tmp = ret->author;
+	tmp += len;
+	*tmp = 0;
+	while(*tmp != ' ')
+		tmp--;
+	ret->author_tz = tmp+1;
+
+	*tmp = 0;
+	while(*tmp != ' ')
+		tmp--;
+	ret->author_time = strtoul(tmp, NULL, 10);
+
+	*tmp = 0;
+	while(*tmp != ' ')
+		tmp--;
+	ret->author_mail = tmp + 1;
+
+	*tmp = 0;
+}
+
+char* format_time(unsigned long time, const char* tz)
+{
+	static char time_buf[128];
+	time_t t = time;
+
+	strftime(time_buf, sizeof(time_buf), "%Y-%m-%d %H:%M:%S ", gmtime(&t));
+	strcat(time_buf, tz);
+	return time_buf;
+}
+
 int main(int argc, const char **argv)
 {
 	int i;
 	struct commit *initial = NULL;
 	unsigned char sha1[20];
-	const char* filename;
+
+	const char *filename = NULL, *commit = NULL;
+	char filename_buf[256];
+	int sha1_len = 8;
+	int compability = 0;
+	int options = 1;
+
 	int num_args;
 	const char* args[10];
 	struct rev_info rev;
 
-	setup_git_directory();
-
-	if (argc != 3)
-		die("Usage: blame commit-ish file");
+	struct commit_info ci;
+	const char *buf;
+	int max_digits;
+
+	const char* prefix = setup_git_directory();
+
+	for(i = 1; i < argc; i++) {
+		if(options) {
+			if(!strcmp(argv[i], "-h") ||
+			   !strcmp(argv[i], "--help"))
+				usage(blame_usage);
+			else if(!strcmp(argv[i], "-l") ||
+				!strcmp(argv[i], "--long")) {
+				sha1_len = 20;
+				continue;
+			} else if(!strcmp(argv[i], "-c") ||
+				  !strcmp(argv[i], "--compability")) {
+				compability = 1;
+				continue;
+			} else if(!strcmp(argv[i], "--")) {
+				options = 0;
+				continue;
+			} else if(argv[i][0] == '-')
+				usage(blame_usage);
+			else
+				options = 0;
+		}
 
+		if(!options) {
+			if(!filename)
+				filename = argv[i];
+			else if(!commit)
+				commit = argv[i];
+			else
+				usage(blame_usage);
+		}
+	}
 
-	filename = argv[2];
+	if(!filename)
+		usage(blame_usage);
+	if(!commit)
+		commit = "HEAD";
+
+	if(prefix)
+		sprintf(filename_buf, "%s%s", prefix, filename);
+	else
+		strcpy(filename_buf, filename);
+	filename = filename_buf;
 
 	{
-		struct commit* commit;
-		if (get_sha1(argv[1], sha1))
-			die("get_sha1 failed");
-		commit = lookup_commit_reference(sha1);
+		struct commit* c;
+		if (get_sha1(commit, sha1))
+			die("get_sha1 failed, commit '%s' not found", commit);
+		c = lookup_commit_reference(sha1);
 
-		if (fill_util_info(commit, filename)) {
-			printf("%s not found in %s\n", filename, argv[1]);
+		if (fill_util_info(c, filename)) {
+			printf("%s not found in %s\n", filename, commit);
 			return 1;
 		}
 	}
@@ -533,7 +639,7 @@ int main(int argc, const char **argv)
 	args[num_args++] = NULL;
 	args[num_args++] = "--topo-order";
 	args[num_args++] = "--remove-empty";
-	args[num_args++] = argv[1];
+	args[num_args++] = commit;
 	args[num_args++] = "--";
 	args[num_args++] = filename;
 	args[num_args] = NULL;
@@ -542,13 +648,33 @@ int main(int argc, const char **argv)
 	prepare_revision_walk(&rev);
 	process_commits(&rev, filename, &initial);
 
+	buf = blame_contents;
+	max_digits = 1 + log(num_blame_lines+1)/log(10);
 	for (i = 0; i < num_blame_lines; i++) {
 		struct commit *c = blame_lines[i];
 		if (!c)
 			c = initial;
 
-		printf("%d %.8s\n", i, sha1_to_hex(c->object.sha1));
-// printf("%d %s\n", i, find_unique_abbrev(blame_lines[i]->object.sha1, 6));
+		get_commit_info(c, &ci);
+		fwrite(sha1_to_hex(c->object.sha1), sha1_len, 1, stdout);
+		if(compability)
+			printf("\t(%10s\t%10s\t%d)", ci.author,
+			       format_time(ci.author_time, ci.author_tz), i+1);
+		else
+			printf(" (%-15.15s %10s %*d) ", ci.author,
+			       format_time(ci.author_time, ci.author_tz),
+			       max_digits, i+1);
+
+		if(i == num_blame_lines - 1) {
+			fwrite(buf, blame_len - (buf - blame_contents),
+			       1, stdout);
+			if(blame_contents[blame_len-1] != '\n')
+				putc('\n', stdout);
+		} else {
+			char* next_buf = index(buf, '\n') + 1;
+			fwrite(buf, next_buf - buf, 1, stdout);
+			buf = next_buf;
+		}
 	}
 
 	if (DEBUG) {
-- 
1.2.4.g4644-dirty

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH] git-blame: Make the output human readable
  2006-03-05 11:03 Fredrik Kuivinen
@ 2006-03-05 12:10 ` Junio C Hamano
  2006-03-05 12:38   ` Fredrik Kuivinen
  2006-03-05 14:11   ` Johannes Schindelin
  0 siblings, 2 replies; 12+ messages in thread
From: Junio C Hamano @ 2006-03-05 12:10 UTC (permalink / raw)
  To: Fredrik Kuivinen; +Cc: git, Johannes Schindelin, Ryan Anderson

Fredrik Kuivinen <freku045@student.liu.se> writes:

> The default output mode is slightly different from git-annotate's.
> However, git-annotate's output mode can be obtained by using the
> '-c' flag.

It might be better to default to human readable and make the
script consumption format an option, if only to reduce typing.

> diff --git a/Makefile b/Makefile
> index b6d8804..eb1887d 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -534,6 +534,10 @@ git-rev-list$X: rev-list.o $(LIB_FILE)
>  	$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) \
>  		$(LIBS) $(OPENSSL_LIBSSL)
>  
> +git-blame$X: blame.o $(LIB_FILE)
> +	$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) \
> +		$(LIBS) -lm
> +

I wonder what it is about to link this binary different from others...

> +char* format_time(unsigned long time, const char* tz)
> +{
> +	static char time_buf[128];
> +	time_t t = time;
> +
> +	strftime(time_buf, sizeof(time_buf), "%Y-%m-%d %H:%M:%S ", gmtime(&t));
> +	strcat(time_buf, tz);
> +	return time_buf;
> +}

I think this shows GMT with time offset, which is compatible
with the human readable time Johannes did to git-annotate.  I do
not know what timezone CVS annotate shows its dates offhand (it
seems to only show dates).  Johannes, is this an attempt to
match what CVS does?

I am wondering if we want to be in line with the date formatting
convention used for our commits and tags, that is, to show local
timestamp with timezone.  The code to use would be show_date()
from date.c if we go that route.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] git-blame: Make the output human readable
  2006-03-05 12:10 ` Junio C Hamano
@ 2006-03-05 12:38   ` Fredrik Kuivinen
  2006-03-05 14:23     ` Johannes Schindelin
  2006-03-05 21:28     ` Junio C Hamano
  2006-03-05 14:11   ` Johannes Schindelin
  1 sibling, 2 replies; 12+ messages in thread
From: Fredrik Kuivinen @ 2006-03-05 12:38 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Fredrik Kuivinen, git, Johannes Schindelin, Ryan Anderson

On Sun, Mar 05, 2006 at 04:10:47AM -0800, Junio C Hamano wrote:
> Fredrik Kuivinen <freku045@student.liu.se> writes:
> 
> > The default output mode is slightly different from git-annotate's.
> > However, git-annotate's output mode can be obtained by using the
> > '-c' flag.
> 
> It might be better to default to human readable and make the
> script consumption format an option, if only to reduce typing.
> 

The default output format is human readable, but it is different from
the output format used by git-annotate. By default, (when the patch is
applied) git-blame outputs lines with on the following form:

   <eight first digits of commit SHA1> (<first 15 letters of committer's name> YYYY-MM-DD HH:MM:SS TZ <line number, right justified>) file contents

where as git-annotate uses

   <eight first digits of commit SHA1>\t(<committer's name, right justified, padded to 10 characters> YYYY-MM-DD HH:MM:SS TZ\t<line number>)file contents

I find the first format easier to read since everything is aligned (we
always output 15 characters for the committer's name padded with
spaces if necessary and the line numbers are padded appropriately). It
also takes up less space on screen since it doesn't use tabs.

However, I wanted to use the tests for git-annotate to test git-blame
too and the tests do, of course, expect the output to be in
git-annotate's format. Hence, the '-c' switch.

We may want to add an output format suitable for scripts too, which
just lists the SHA1. But I don't think it is much more difficult to
parse the format above, at least not if you just want the SHA1s.

> > diff --git a/Makefile b/Makefile
> > index b6d8804..eb1887d 100644
> > --- a/Makefile
> > +++ b/Makefile
> > @@ -534,6 +534,10 @@ git-rev-list$X: rev-list.o $(LIB_FILE)
> >  	$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) \
> >  		$(LIBS) $(OPENSSL_LIBSSL)
> >  
> > +git-blame$X: blame.o $(LIB_FILE)
> > +	$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) \
> > +		$(LIBS) -lm
> > +
> 
> I wonder what it is about to link this binary different from others...
> 

It uses log(3) to compute the number of digits needed to represent the
last line number. It is probably better to this some other way
though...

> > +char* format_time(unsigned long time, const char* tz)
> > +{
> > +	static char time_buf[128];
> > +	time_t t = time;
> > +
> > +	strftime(time_buf, sizeof(time_buf), "%Y-%m-%d %H:%M:%S ", gmtime(&t));
> > +	strcat(time_buf, tz);
> > +	return time_buf;
> > +}
> 
> I think this shows GMT with time offset, which is compatible
> with the human readable time Johannes did to git-annotate.  I do
> not know what timezone CVS annotate shows its dates offhand (it
> seems to only show dates).  Johannes, is this an attempt to
> match what CVS does?
> 
> I am wondering if we want to be in line with the date formatting
> convention used for our commits and tags, that is, to show local
> timestamp with timezone.  The code to use would be show_date()
> from date.c if we go that route.
> 

I think it is a good idea. Consistency is good.

- Fredrik

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] git-blame: Make the output human readable
  2006-03-05 12:10 ` Junio C Hamano
  2006-03-05 12:38   ` Fredrik Kuivinen
@ 2006-03-05 14:11   ` Johannes Schindelin
  1 sibling, 0 replies; 12+ messages in thread
From: Johannes Schindelin @ 2006-03-05 14:11 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Fredrik Kuivinen, git, Ryan Anderson

Hi,

On Sun, 5 Mar 2006, Junio C Hamano wrote:

> Fredrik Kuivinen <freku045@student.liu.se> writes:
> 
> > +char* format_time(unsigned long time, const char* tz)
> > +{
> > +	static char time_buf[128];
> > +	time_t t = time;
> > +
> > +	strftime(time_buf, sizeof(time_buf), "%Y-%m-%d %H:%M:%S ", gmtime(&t));
> > +	strcat(time_buf, tz);
> > +	return time_buf;
> > +}
> 
> I think this shows GMT with time offset, which is compatible
> with the human readable time Johannes did to git-annotate.  I do
> not know what timezone CVS annotate shows its dates offhand (it
> seems to only show dates).  Johannes, is this an attempt to
> match what CVS does?

CVS only shows the date, something like

	strftime("%Y-%b-%d", gmtime($timestamp));

> I am wondering if we want to be in line with the date formatting
> convention used for our commits and tags, that is, to show local
> timestamp with timezone.  The code to use would be show_date()
> from date.c if we go that route.

I like that approach. Sometimes imitating CVS can be overdone.

Ciao,
Dscho

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] git-blame: Make the output human readable
  2006-03-05 12:38   ` Fredrik Kuivinen
@ 2006-03-05 14:23     ` Johannes Schindelin
  2006-03-05 21:28     ` Junio C Hamano
  1 sibling, 0 replies; 12+ messages in thread
From: Johannes Schindelin @ 2006-03-05 14:23 UTC (permalink / raw)
  To: Fredrik Kuivinen; +Cc: Junio C Hamano, git

Hi,

On Sun, 5 Mar 2006, Fredrik Kuivinen wrote:

> On Sun, Mar 05, 2006 at 04:10:47AM -0800, Junio C Hamano wrote:
> > Fredrik Kuivinen <freku045@student.liu.se> writes:
> > 
> > > +git-blame$X: blame.o $(LIB_FILE)
> > > +	$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) \
> > > +		$(LIBS) -lm
> > > +
> > 
> > I wonder what it is about to link this binary different from others...
> > 
> 
> It uses log(3) to compute the number of digits needed to represent the
> last line number. It is probably better to this some other way
> though...

How about this:

-	max_digits = 1 + log(num_blame_lines+1)/log(10);
+	for (i = 10, max_digits = 1; i <= num_blame_lines + 1;
+		i *= 10, max_digits++);

(Totally untested)

Hth,
Dscho

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] git-blame: Make the output human readable
  2006-03-05 12:38   ` Fredrik Kuivinen
  2006-03-05 14:23     ` Johannes Schindelin
@ 2006-03-05 21:28     ` Junio C Hamano
  2006-03-07 16:34       ` Fredrik Kuivinen
  1 sibling, 1 reply; 12+ messages in thread
From: Junio C Hamano @ 2006-03-05 21:28 UTC (permalink / raw)
  To: Fredrik Kuivinen; +Cc: git, Johannes Schindelin, Ryan Anderson

Fredrik Kuivinen <freku045@student.liu.se> writes:

> I find the first format easier to read since everything is aligned (we
> always output 15 characters for the committer's name padded with
> spaces if necessary and the line numbers are padded appropriately). It
> also takes up less space on screen since it doesn't use tabs.

Careful.  The convention for names is to encode them in UTF-8,
so if you mean 15 "characters" that might be OK but if it can
truncate in the middle of a multibyte sequence of UTF-8 encoded
single character it is a no-no.

We are talking about "casual" aligning, so I would not bring up
"proportional fonts", but even on a monospace terminal and
line-printer output, there still is the issue of byte count not
matching columns.  In an i18nized environment, aligning columns
by counting bytes is a lost battle.

In any case, please make sure you do not chop a character in the
middle at least [*1*].

> We may want to add an output format suitable for scripts too, which
> just lists the SHA1. But I don't think it is much more difficult to
> parse the format above, at least not if you just want the SHA1s.

Fair enough.

> It uses log(3) to compute the number of digits needed to represent the
> last line number. It is probably better to this some other way
> though...

Yup.

[Footnote]

*1* I was kind of impressed that Linus was careful about this
issue when I saw the commit log chopping is only done at line
boundaries.  A very careful coder would have chopped the last
line in the middle at character boundary, but if you do not want
to bother that much, chopping at line boundary is better than
chopping the last line in the middle of a character ;-).

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] git-blame: Make the output human readable
@ 2006-03-06 19:33 linux
  2006-03-08 14:32 ` Sergey Vlasov
  0 siblings, 1 reply; 12+ messages in thread
From: linux @ 2006-03-06 19:33 UTC (permalink / raw)
  To: junkio; +Cc: git

Well, getting 15 characters in UTF-8 is easy (just stop before the 16th
byte for which ((b & 0xc0) != 0x80)), but what about combining characters?

You've got accents and stuff to worry about.  And the annoying fact that
Unicode defined accents as suffixes, so you have to go past the 15th
column to include all of the 

And then there's that fact that many characters are traditionally
represented as double-wide forms, even on character terminals.

See http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c for details
an an example implementation of wcwidth().

Using that, it would be something like (compiles but untested):

/*
 * Return the number of bytes from the nul-terminated utf8 string
 * that can be printed in at most max columns using a monospaced
 * font.  *actual returns the number of columns actually occupied,
 * which may be less than max.
 *
 * Output is truncated before any control characters or illegal
 * UTF-8 sequences.
 */
unsigned
fit_columns(char const *utf8, unsigned max, unsigned *actual)
{
	char const * const origin = utf8;
	unsigned width = 0;
	unsigned pos = 0;
	unsigned c;

	for (;;) {
		unsigned w;
		unsigned c = *utf8++;

		/* Part 1: Parse the next Unicode code point */
		if (c < 0x20) {
			break;	/* Control character - stop */
		} else if (c < 0x7F) {
			w = 1;	/* Standard ASCII */
		} else if (c < 0xC2 || c > 0xF4) {
			break;	/* DEL or illegal Unicode */
		} else {
			/* Multi-byte UTF-8 sequence */
			unsigned n;
			unsigned char byte = *utf8++;

			if (c < 0xE0) {
				/* 2-byte sequence: U+0080..U+07FF */
				n = 1;
				c &= 0x1F;
			} else if (c < 0xF0) {
				/* 3-byte sequence: U+0800..U+FFFF */
				if (c == 0xE0 && byte < 0xA0)
					break;	/* < /U+0800 */
				n = 2;
				c &= 0x0F;
			} else {
				/* 4-byte sequence: U+10000..U+10FFFF */
				if (byte < 0x90 ? c == 0xF0 : c == 0xF4)
					break; /* < 10000 or > 10FFFF */
				n = 3;
				c &= 0x07;
			}

			for (; n--; byte = *utf8++) {
				if (byte & 0xc0 != 0x80)
					goto done;	/* Double break */
				c = (c << 6) | (byte & 0x3f);
			}
			/* Now find the width of it */
			w = wcwidth(c);
			if (w == -1)
				break;
		}

		/* Part 2: Figure out if it will fit */
		if (width + w > max)
			break;	/* Would exceed space - stop */
		/* Part 3: It fits; update our statistics */
		width += w;
		pos = (unsigned)(utf8 - origin);
	}

done:
	if (actual)
		*actual = width;
	return pos;
}

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] git-blame: Make the output human readable
  2006-03-05 21:28     ` Junio C Hamano
@ 2006-03-07 16:34       ` Fredrik Kuivinen
  0 siblings, 0 replies; 12+ messages in thread
From: Fredrik Kuivinen @ 2006-03-07 16:34 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Fredrik Kuivinen, git, Johannes Schindelin, Ryan Anderson

On Sun, Mar 05, 2006 at 01:28:57PM -0800, Junio C Hamano wrote:
> Fredrik Kuivinen <freku045@student.liu.se> writes:
> 
> > I find the first format easier to read since everything is aligned (we
> > always output 15 characters for the committer's name padded with
> > spaces if necessary and the line numbers are padded appropriately). It
> > also takes up less space on screen since it doesn't use tabs.
> 
> Careful.  The convention for names is to encode them in UTF-8,
> so if you mean 15 "characters" that might be OK but if it can
> truncate in the middle of a multibyte sequence of UTF-8 encoded
> single character it is a no-no.
> 
> We are talking about "casual" aligning, so I would not bring up
> "proportional fonts", but even on a monospace terminal and
> line-printer output, there still is the issue of byte count not
> matching columns.  In an i18nized environment, aligning columns
> by counting bytes is a lost battle.
> 
> In any case, please make sure you do not chop a character in the
> middle at least [*1*].
> 

I hadn't thought of that. The code as it is now will occasionally do
the wrong thing when encountered with multibyte per code point
encodings. If we can assume that the name is in UTF-8 then the code
posted by "linux@horizon.com" to the list should do the trick.

However, can we assume that the name is in UTF-8? Do
i18n.commitencoding only apply to the commit message, or is it
intended to apply to the author/committer information too?

I am beginning to think that it might be best to, by default, store
the author, committer and commit messages in UTF-8. And then
automatically convert from/to the users locale on input/output. (I
just read the old "Rss produced by git is not valid xml?" thread where
this issue was discussed quite a bit. However, I didn't see the 'store
in UTF-8 by default and convert from/to users locale' method anywhere
in that thread.)

- Fredrik

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] git-blame: Make the output human readable
  2006-03-06 19:33 [PATCH] git-blame: Make the output human readable linux
@ 2006-03-08 14:32 ` Sergey Vlasov
  2006-03-08 18:04   ` linux
  0 siblings, 1 reply; 12+ messages in thread
From: Sergey Vlasov @ 2006-03-08 14:32 UTC (permalink / raw)
  To: linux; +Cc: junkio, git

[-- Attachment #1: Type: text/plain, Size: 973 bytes --]

On 6 Mar 2006 14:33:26 -0500 linux@horizon.com wrote:

> Well, getting 15 characters in UTF-8 is easy (just stop before the 16th
> byte for which ((b & 0xc0) != 0x80)), but what about combining characters?
> 
> You've got accents and stuff to worry about.  And the annoying fact that
> Unicode defined accents as suffixes, so you have to go past the 15th
> column to include all of the 
> 
> And then there's that fact that many characters are traditionally
> represented as double-wide forms, even on character terminals.
> 
> See http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c for details
> an an example implementation of wcwidth().
> 
[skip]
> 			/* Now find the width of it */
> 			w = wcwidth(c);

And this won't work, unless you also add that wcwidth() implementation
to git.

The problem is that the wchar_t encoding is not specified anywhere -
glibc uses Unicode for it, but other systems can use whatever they want
(even locale-dependent).

[-- Attachment #2: Type: application/pgp-signature, Size: 190 bytes --]

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] git-blame: Make the output human readable
  2006-03-08 14:32 ` Sergey Vlasov
@ 2006-03-08 18:04   ` linux
  2006-03-08 18:30     ` Sergey Vlasov
  0 siblings, 1 reply; 12+ messages in thread
From: linux @ 2006-03-08 18:04 UTC (permalink / raw)
  To: linux, vsu; +Cc: git, junkio

> And this won't work, unless you also add that wcwidth() implementation
> to git.

That was the general idea.  It is freely usable.

> The problem is that the wchar_t encoding is not specified anywhere -
> glibc uses Unicode for it, but other systems can use whatever they want
> (even locale-dependent).

Why is that a problem?  None of the code mentioned even uses wchar_t.
The code I wrote converts from UTF-8 to straight Unicode, and that's
what Markus Kuhn's wcwidth() expects as an argument.

At no time do we ask the compiler for its opinion on the subject.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] git-blame: Make the output human readable
  2006-03-08 18:04   ` linux
@ 2006-03-08 18:30     ` Sergey Vlasov
  2006-03-08 19:06       ` linux
  0 siblings, 1 reply; 12+ messages in thread
From: Sergey Vlasov @ 2006-03-08 18:30 UTC (permalink / raw)
  To: linux; +Cc: git, junkio

[-- Attachment #1: Type: text/plain, Size: 1004 bytes --]

On Wed, Mar 08, 2006 at 01:04:22PM -0500, linux@horizon.com wrote:
> > And this won't work, unless you also add that wcwidth() implementation
> > to git.
> 
> That was the general idea.  It is freely usable.
> 
> > The problem is that the wchar_t encoding is not specified anywhere -
> > glibc uses Unicode for it, but other systems can use whatever they want
> > (even locale-dependent).
> 
> Why is that a problem?  None of the code mentioned even uses wchar_t.
> The code I wrote converts from UTF-8 to straight Unicode, and that's
> what Markus Kuhn's wcwidth() expects as an argument.

wcwidth() is a standard library function which takes a wchar_t:

http://www.opengroup.org/onlinepubs/009695399/functions/wcwidth.html

It is easy to write a program which assumes that wchar_t is Unicode
without noticing it, because it will work fine with glibc...

So that mk_wcwidth() must be used unconditionally, and not as a
fallback for systems which do not provide wcwidth() in libc.

[-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] git-blame: Make the output human readable
  2006-03-08 18:30     ` Sergey Vlasov
@ 2006-03-08 19:06       ` linux
  0 siblings, 0 replies; 12+ messages in thread
From: linux @ 2006-03-08 19:06 UTC (permalink / raw)
  To: linux, vsu; +Cc: git, junkio

> So that mk_wcwidth() must be used unconditionally, and not as a
> fallback for systems which do not provide wcwidth() in libc.

Ah, the light dawns!  I now understand your confusion.
That was exactly the idea; apologies for being unclear.

Kind of like the existing issspace(), isdigit(), isalpha(), etc.
implementations in git-compat-util.h to avoid the mysteries and vagaries
of locales.  A UTF-8-only solution is desired.

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2006-03-08 19:06 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-03-06 19:33 [PATCH] git-blame: Make the output human readable linux
2006-03-08 14:32 ` Sergey Vlasov
2006-03-08 18:04   ` linux
2006-03-08 18:30     ` Sergey Vlasov
2006-03-08 19:06       ` linux
  -- strict thread matches above, loose matches on Subject: below --
2006-03-05 11:03 Fredrik Kuivinen
2006-03-05 12:10 ` Junio C Hamano
2006-03-05 12:38   ` Fredrik Kuivinen
2006-03-05 14:23     ` Johannes Schindelin
2006-03-05 21:28     ` Junio C Hamano
2006-03-07 16:34       ` Fredrik Kuivinen
2006-03-05 14:11   ` Johannes Schindelin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).