git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/4] External 'filter' attributes and drivers
@ 2007-04-21 10:40 Junio C Hamano
       [not found] ` <11771520591703-gi t-send-email-junkio@cox.net>
                   ` (7 more replies)
  0 siblings, 8 replies; 22+ messages in thread
From: Junio C Hamano @ 2007-04-21 10:40 UTC (permalink / raw)
  To: git

I know this is controversial, but here is a small four patch
series to let you insert arbitrary external filter in checkin
and checkout codepath.

[PATCH 1/4] Simplify calling of CR/LF conversion routines
[PATCH 2/4] convert.c: restructure the attribute checking part.
[PATCH 3/4] lockfile: record the primary process.
[PATCH 4/4] Add 'filter' attribute and external filter driver definition.


[1/4] is Alex's earlier patch, rebased on top of 'next'.

[3/4] is necessary for the series because otherwise 'git add'
would not work with any external filter, but the change is
applicable to 'master'.

[4/4] is the body of the change.  I wanted to like run-command.h
process spawning infrastructure, but I suspect I did not use it
optimally.  People who were more involved in its evolution
hopefully have suggestions for better use of it.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH 1/4] Simplify calling of CR/LF conversion routines
  2007-04-21 10:40 [PATCH 0/4] External 'filter' attributes and drivers Junio C Hamano
       [not found] ` <11771520591703-gi t-send-email-junkio@cox.net>
@ 2007-04-21 10:40 ` Junio C Hamano
  2007-04-21 10:40 ` [PATCH 2/4] convert.c: restructure the attribute checking part Junio C Hamano
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 22+ messages in thread
From: Junio C Hamano @ 2007-04-21 10:40 UTC (permalink / raw)
  To: git

From: Alex Riesen <raa.lkml@gmail.com>

Signed-off-by: Alex Riesen <raa.lkml@gmail.com>
Signed-off-by: Junio C Hamano <junkio@cox.net>
---

 * This is rebased on 'next'

 builtin-apply.c |   18 +++++--------
 cache.h         |    4 +-
 convert.c       |   71 +++++++++++++++++++++++++++----------------------------
 diff.c          |    4 +-
 entry.c         |    7 +----
 sha1_file.c     |    7 ++---
 6 files changed, 51 insertions(+), 60 deletions(-)

diff --git a/builtin-apply.c b/builtin-apply.c
index fd92ef7..ccd342c 100644
--- a/builtin-apply.c
+++ b/builtin-apply.c
@@ -1475,8 +1475,8 @@ static int read_old_data(struct stat *st, const char *path, char **buf_p, unsign
 		}
 		close(fd);
 		nsize = got;
-		nbuf = buf;
-		if (convert_to_git(path, &nbuf, &nsize)) {
+		nbuf = convert_to_git(path, buf, &nsize);
+		if (nbuf) {
 			free(buf);
 			*buf_p = nbuf;
 			*alloc_p = nsize;
@@ -2355,9 +2355,8 @@ static void add_index_file(const char *path, unsigned mode, void *buf, unsigned
 
 static int try_create_file(const char *path, unsigned int mode, const char *buf, unsigned long size)
 {
-	int fd, converted;
+	int fd;
 	char *nbuf;
-	unsigned long nsize;
 
 	if (has_symlinks && S_ISLNK(mode))
 		/* Although buf:size is counted string, it also is NUL
@@ -2369,13 +2368,10 @@ static int try_create_file(const char *path, unsigned int mode, const char *buf,
 	if (fd < 0)
 		return -1;
 
-	nsize = size;
-	nbuf = (char *) buf;
-	converted = convert_to_working_tree(path, &nbuf, &nsize);
-	if (converted) {
+	nbuf = convert_to_working_tree(path, buf, &size);
+	if (nbuf)
 		buf = nbuf;
-		size = nsize;
-	}
+
 	while (size) {
 		int written = xwrite(fd, buf, size);
 		if (written < 0)
@@ -2387,7 +2383,7 @@ static int try_create_file(const char *path, unsigned int mode, const char *buf,
 	}
 	if (close(fd) < 0)
 		die("closing file %s: %s", path, strerror(errno));
-	if (converted)
+	if (nbuf)
 		free(nbuf);
 	return 0;
 }
diff --git a/cache.h b/cache.h
index 38ad006..8c804cb 100644
--- a/cache.h
+++ b/cache.h
@@ -496,8 +496,8 @@ extern void trace_printf(const char *format, ...);
 extern void trace_argv_printf(const char **argv, int count, const char *format, ...);
 
 /* convert.c */
-extern int convert_to_git(const char *path, char **bufp, unsigned long *sizep);
-extern int convert_to_working_tree(const char *path, char **bufp, unsigned long *sizep);
+extern char *convert_to_git(const char *path, const char *src, unsigned long *sizep);
+extern char *convert_to_working_tree(const char *path, const char *src, unsigned long *sizep);
 
 /* match-trees.c */
 void shift_tree(const unsigned char *, const unsigned char *, unsigned char *, int);
diff --git a/convert.c b/convert.c
index da64253..742b895 100644
--- a/convert.c
+++ b/convert.c
@@ -79,25 +79,24 @@ static int is_binary(unsigned long size, struct text_stat *stats)
 	return 0;
 }
 
-static int crlf_to_git(const char *path, char **bufp, unsigned long *sizep, int action)
+static char *crlf_to_git(const char *path, const char *src, unsigned long *sizep, int action)
 {
-	char *buffer, *nbuf;
+	char *buffer, *dst;
 	unsigned long size, nsize;
 	struct text_stat stats;
 
 	if ((action == CRLF_BINARY) || (action == CRLF_GUESS && !auto_crlf))
-		return 0;
+		return NULL;
 
 	size = *sizep;
 	if (!size)
-		return 0;
-	buffer = *bufp;
+		return NULL;
 
-	gather_stats(buffer, size, &stats);
+	gather_stats(src, size, &stats);
 
 	/* No CR? Nothing to convert, regardless. */
 	if (!stats.cr)
-		return 0;
+		return NULL;
 
 	if (action == CRLF_GUESS) {
 		/*
@@ -106,13 +105,13 @@ static int crlf_to_git(const char *path, char **bufp, unsigned long *sizep, int
 		 * stuff?
 		 */
 		if (stats.cr != stats.crlf)
-			return 0;
+			return NULL;
 
 		/*
 		 * And add some heuristics for binary vs text, of course...
 		 */
 		if (is_binary(size, &stats))
-			return 0;
+			return NULL;
 	}
 
 	/*
@@ -120,10 +119,10 @@ static int crlf_to_git(const char *path, char **bufp, unsigned long *sizep, int
 	 * to let the caller know that we switched buffers on it.
 	 */
 	nsize = size - stats.crlf;
-	nbuf = xmalloc(nsize);
-	*bufp = nbuf;
+	buffer = xmalloc(nsize);
 	*sizep = nsize;
 
+	dst = buffer;
 	if (action == CRLF_GUESS) {
 		/*
 		 * If we guessed, we already know we rejected a file with
@@ -131,54 +130,53 @@ static int crlf_to_git(const char *path, char **bufp, unsigned long *sizep, int
 		 * follow it.
 		 */
 		do {
-			unsigned char c = *buffer++;
+			unsigned char c = *src++;
 			if (c != '\r')
-				*nbuf++ = c;
+				*dst++ = c;
 		} while (--size);
 	} else {
 		do {
-			unsigned char c = *buffer++;
+			unsigned char c = *src++;
 			if (! (c == '\r' && (1 < size && *buffer == '\n')))
-				*nbuf++ = c;
+				*dst++ = c;
 		} while (--size);
 	}
 
-	return 1;
+	return buffer;
 }
 
-static int crlf_to_worktree(const char *path, char **bufp, unsigned long *sizep, int action)
+static char *crlf_to_worktree(const char *path, const char *src, unsigned long *sizep, int action)
 {
-	char *buffer, *nbuf;
+	char *buffer, *dst;
 	unsigned long size, nsize;
 	struct text_stat stats;
 	unsigned char last;
 
 	if ((action == CRLF_BINARY) || (action == CRLF_INPUT) ||
 	    (action == CRLF_GUESS && auto_crlf <= 0))
-		return 0;
+		return NULL;
 
 	size = *sizep;
 	if (!size)
-		return 0;
-	buffer = *bufp;
+		return NULL;
 
-	gather_stats(buffer, size, &stats);
+	gather_stats(src, size, &stats);
 
 	/* No LF? Nothing to convert, regardless. */
 	if (!stats.lf)
-		return 0;
+		return NULL;
 
 	/* Was it already in CRLF format? */
 	if (stats.lf == stats.crlf)
-		return 0;
+		return NULL;
 
 	if (action == CRLF_GUESS) {
 		/* If we have any bare CR characters, we're not going to touch it */
 		if (stats.cr != stats.crlf)
-			return 0;
+			return NULL;
 
 		if (is_binary(size, &stats))
-			return 0;
+			return NULL;
 	}
 
 	/*
@@ -186,19 +184,20 @@ static int crlf_to_worktree(const char *path, char **bufp, unsigned long *sizep,
 	 * to let the caller know that we switched buffers on it.
 	 */
 	nsize = size + stats.lf - stats.crlf;
-	nbuf = xmalloc(nsize);
-	*bufp = nbuf;
+	buffer = xmalloc(nsize);
 	*sizep = nsize;
 	last = 0;
+
+	dst = buffer;
 	do {
-		unsigned char c = *buffer++;
+		unsigned char c = *src++;
 		if (c == '\n' && last != '\r')
-			*nbuf++ = '\r';
-		*nbuf++ = c;
+			*dst++ = '\r';
+		*dst++ = c;
 		last = c;
 	} while (--size);
 
-	return 1;
+	return buffer;
 }
 
 static void setup_crlf_check(struct git_attr_check *check)
@@ -231,12 +230,12 @@ static int git_path_check_crlf(const char *path)
 	return CRLF_GUESS;
 }
 
-int convert_to_git(const char *path, char **bufp, unsigned long *sizep)
+char *convert_to_git(const char *path, const char *src, unsigned long *sizep)
 {
-	return crlf_to_git(path, bufp, sizep, git_path_check_crlf(path));
+	return crlf_to_git(path, src, sizep, git_path_check_crlf(path));
 }
 
-int convert_to_working_tree(const char *path, char **bufp, unsigned long *sizep)
+char *convert_to_working_tree(const char *path, const char *src, unsigned long *sizep)
 {
-	return crlf_to_worktree(path, bufp, sizep, git_path_check_crlf(path));
+	return crlf_to_worktree(path, src, sizep, git_path_check_crlf(path));
 }
diff --git a/diff.c b/diff.c
index 5f50186..1cb1230 100644
--- a/diff.c
+++ b/diff.c
@@ -1493,9 +1493,9 @@ int diff_populate_filespec(struct diff_filespec *s, int size_only)
 		/*
 		 * Convert from working tree format to canonical git format
 		 */
-		buf = s->data;
 		size = s->size;
-		if (convert_to_git(s->path, &buf, &size)) {
+		buf = convert_to_git(s->path, s->data, &size);
+		if (buf) {
 			munmap(s->data, s->size);
 			s->should_munmap = 0;
 			s->data = buf;
diff --git a/entry.c b/entry.c
index d72f811..3771209 100644
--- a/entry.c
+++ b/entry.c
@@ -79,7 +79,6 @@ static int write_entry(struct cache_entry *ce, char *path, struct checkout *stat
 	}
 	switch (ntohl(ce->ce_mode) & S_IFMT) {
 		char *buf;
-		unsigned long nsize;
 
 	case S_IFREG:
 		if (to_tempfile) {
@@ -96,12 +95,10 @@ static int write_entry(struct cache_entry *ce, char *path, struct checkout *stat
 		/*
 		 * Convert from git internal format to working tree format
 		 */
-		buf = new;
-		nsize = size;
-		if (convert_to_working_tree(ce->name, &buf, &nsize)) {
+		buf = convert_to_working_tree(ce->name, new, &size);
+		if (buf) {
 			free(new);
 			new = buf;
-			size = nsize;
 		}
 
 		wrote = write_in_full(fd, new, size);
diff --git a/sha1_file.c b/sha1_file.c
index 4304fe9..1978d5f 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -2277,10 +2277,9 @@ int index_fd(unsigned char *sha1, int fd, struct stat *st, int write_object,
 	 */
 	if ((type == OBJ_BLOB) && S_ISREG(st->st_mode)) {
 		unsigned long nsize = size;
-		char *nbuf = buf;
-		if (convert_to_git(path, &nbuf, &nsize)) {
-			if (size)
-				munmap(buf, size);
+		char *nbuf = convert_to_git(path, buf, &nsize);
+		if (nbuf) {
+			munmap(buf, size);
 			size = nsize;
 			buf = nbuf;
 			re_allocated = 1;
-- 
1.5.1.1.948.g9f6f

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 2/4] convert.c: restructure the attribute checking part.
  2007-04-21 10:40 [PATCH 0/4] External 'filter' attributes and drivers Junio C Hamano
       [not found] ` <11771520591703-gi t-send-email-junkio@cox.net>
  2007-04-21 10:40 ` [PATCH 1/4] Simplify calling of CR/LF conversion routines Junio C Hamano
@ 2007-04-21 10:40 ` Junio C Hamano
  2007-04-21 10:40 ` [PATCH 3/4] lockfile: record the primary process Junio C Hamano
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 22+ messages in thread
From: Junio C Hamano @ 2007-04-21 10:40 UTC (permalink / raw)
  To: git

This separates the checkattr() call and interpretation of the
returned value specific to the 'crlf' attribute into separate
routines, so that we can run a single call to checkattr() to
check for more than one attributes, and then interprete what
the returned settings mean separately.

Signed-off-by: Junio C Hamano <junkio@cox.net>
---
 convert.c |   48 ++++++++++++++++++++++++++++--------------------
 1 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/convert.c b/convert.c
index 742b895..37239ac 100644
--- a/convert.c
+++ b/convert.c
@@ -200,7 +200,7 @@ static char *crlf_to_worktree(const char *path, const char *src, unsigned long *
 	return buffer;
 }
 
-static void setup_crlf_check(struct git_attr_check *check)
+static void setup_convert_check(struct git_attr_check *check)
 {
 	static struct git_attr *attr_crlf;
 
@@ -209,33 +209,41 @@ static void setup_crlf_check(struct git_attr_check *check)
 	check->attr = attr_crlf;
 }
 
-static int git_path_check_crlf(const char *path)
+static int git_path_check_crlf(const char *path, struct git_attr_check *check)
 {
-	struct git_attr_check attr_crlf_check;
-
-	setup_crlf_check(&attr_crlf_check);
-
-	if (!git_checkattr(path, 1, &attr_crlf_check)) {
-		const char *value = attr_crlf_check.value;
-		if (ATTR_TRUE(value))
-			return CRLF_TEXT;
-		else if (ATTR_FALSE(value))
-			return CRLF_BINARY;
-		else if (ATTR_UNSET(value))
-			;
-		else if (!strcmp(value, "input"))
-			return CRLF_INPUT;
-		/* fallthru */
-	}
+	const char *value = check->value;
+
+	if (ATTR_TRUE(value))
+		return CRLF_TEXT;
+	else if (ATTR_FALSE(value))
+		return CRLF_BINARY;
+	else if (ATTR_UNSET(value))
+		;
+	else if (!strcmp(value, "input"))
+		return CRLF_INPUT;
 	return CRLF_GUESS;
 }
 
 char *convert_to_git(const char *path, const char *src, unsigned long *sizep)
 {
-	return crlf_to_git(path, src, sizep, git_path_check_crlf(path));
+	struct git_attr_check check[1];
+	int crlf = CRLF_GUESS;
+
+	setup_convert_check(check);
+	if (!git_checkattr(path, 1, check)) {
+		crlf = git_path_check_crlf(path, check);
+	}
+	return crlf_to_git(path, src, sizep, crlf);
 }
 
 char *convert_to_working_tree(const char *path, const char *src, unsigned long *sizep)
 {
-	return crlf_to_worktree(path, src, sizep, git_path_check_crlf(path));
+	struct git_attr_check check[1];
+	int crlf = CRLF_GUESS;
+
+	setup_convert_check(check);
+	if (!git_checkattr(path, 1, check)) {
+		crlf = git_path_check_crlf(path, check);
+	}
+	return crlf_to_worktree(path, src, sizep, crlf);
 }
-- 
1.5.1.1.948.g9f6f

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 3/4] lockfile: record the primary process.
  2007-04-21 10:40 [PATCH 0/4] External 'filter' attributes and drivers Junio C Hamano
                   ` (2 preceding siblings ...)
  2007-04-21 10:40 ` [PATCH 2/4] convert.c: restructure the attribute checking part Junio C Hamano
@ 2007-04-21 10:40 ` Junio C Hamano
  2007-04-21 10:40 ` [PATCH 4/4] Add 'filter' attribute and external filter driver definition Junio C Hamano
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 22+ messages in thread
From: Junio C Hamano @ 2007-04-21 10:40 UTC (permalink / raw)
  To: git

The usual process flow is the main process opens and holds the lock to
the index, does its thing, perhaps spawning children during the course,
and then writes the resulting index out by releaseing the lock.

However, the lockfile interface uses atexit(3) to clean it up, without
regard to who actually created the lock.  This typically leads to a
confusing behaviour of lock being released too early when the child
exits, and then the parent process when it calls commit_lockfile()
finds that it cannot unlock it.

This fixes the problem by recording who created and holds the lock, and
upon atexit(3) handler, child simply ignores the lockfile the parent
created.

Signed-off-by: Junio C Hamano <junkio@cox.net>
---
 cache.h    |    1 +
 lockfile.c |    6 +++++-
 2 files changed, 6 insertions(+), 1 deletions(-)

diff --git a/cache.h b/cache.h
index 8c804cb..faddaf6 100644
--- a/cache.h
+++ b/cache.h
@@ -209,6 +209,7 @@ extern int refresh_cache(unsigned int flags);
 
 struct lock_file {
 	struct lock_file *next;
+	pid_t owner;
 	char on_list;
 	char filename[PATH_MAX];
 };
diff --git a/lockfile.c b/lockfile.c
index bed6b21..23db35a 100644
--- a/lockfile.c
+++ b/lockfile.c
@@ -8,8 +8,11 @@ static const char *alternate_index_output;
 
 static void remove_lock_file(void)
 {
+	pid_t me = getpid();
+
 	while (lock_file_list) {
-		if (lock_file_list->filename[0])
+		if (lock_file_list->owner == me &&
+		    lock_file_list->filename[0])
 			unlink(lock_file_list->filename);
 		lock_file_list = lock_file_list->next;
 	}
@@ -28,6 +31,7 @@ static int lock_file(struct lock_file *lk, const char *path)
 	sprintf(lk->filename, "%s.lock", path);
 	fd = open(lk->filename, O_RDWR | O_CREAT | O_EXCL, 0666);
 	if (0 <= fd) {
+		lk->owner = getpid();
 		if (!lk->on_list) {
 			lk->next = lock_file_list;
 			lock_file_list = lk;
-- 
1.5.1.1.948.g9f6f

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 4/4] Add 'filter' attribute and external filter driver definition.
  2007-04-21 10:40 [PATCH 0/4] External 'filter' attributes and drivers Junio C Hamano
                   ` (3 preceding siblings ...)
  2007-04-21 10:40 ` [PATCH 3/4] lockfile: record the primary process Junio C Hamano
@ 2007-04-21 10:40 ` Junio C Hamano
  2007-04-22  0:39   ` Shawn O. Pearce
                     ` (2 more replies)
  2007-04-21 20:03 ` [PATCH 0/4] External 'filter' attributes and drivers Alex Riesen
                   ` (2 subsequent siblings)
  7 siblings, 3 replies; 22+ messages in thread
From: Junio C Hamano @ 2007-04-21 10:40 UTC (permalink / raw)
  To: git

The interface is similar to the custom low-level merge drivers.

First you configure your filter driver by defining 'filter.<name>.*'
variables in the configuration.

	filter.<name>.clean	filter command to run upon checkin
	filter.<name>.smudge	filter command to run upon checkout

Then you assign filter attribute to each path, whose name
matches the custom filter driver's name.

Example:

	(in .gitattributes)
	*.c	filter=indent

	(in config)
	[filter "indent"]
		clean = indent
		smudge = cat

Signed-off-by: Junio C Hamano <junkio@cox.net>
---
 convert.c         |  253 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 t/t0021-filter.sh |   35 ++++++++
 2 files changed, 279 insertions(+), 9 deletions(-)
 create mode 100755 t/t0021-filter.sh

diff --git a/convert.c b/convert.c
index 37239ac..62d8cee 100644
--- a/convert.c
+++ b/convert.c
@@ -1,5 +1,6 @@
 #include "cache.h"
 #include "attr.h"
+#include "run-command.h"
 
 /*
  * convert.c - convert a file when checking it out and checking it in.
@@ -200,18 +201,208 @@ static char *crlf_to_worktree(const char *path, const char *src, unsigned long *
 	return buffer;
 }
 
+static int filter_buffer(const char *path, const char *src,
+			 unsigned long size, const char *cmd)
+{
+	/*
+	 * Spawn cmd and feed the buffer contents through its stdin.
+	 */
+	struct child_process child_process;
+	int pipe_feed[2];
+	int write_err, status;
+
+	memset(&child_process, 0, sizeof(child_process));
+
+	if (pipe(pipe_feed) < 0) {
+		error("cannot create pipe to run external filter %s", cmd);
+		return 1;
+	}
+
+	child_process.pid = fork();
+	if (child_process.pid < 0) {
+		error("cannot fork to run external filter %s", cmd);
+		close(pipe_feed[0]);
+		close(pipe_feed[1]);
+		return 1;
+	}
+	if (!child_process.pid) {
+		dup2(pipe_feed[0], 0);
+		close(pipe_feed[0]);
+		close(pipe_feed[1]);
+		execlp(cmd, cmd, NULL);
+		return 1;
+	}
+	close(pipe_feed[0]);
+
+	write_err = (write_in_full(pipe_feed[1], src, size) < 0);
+	if (close(pipe_feed[1]))
+		write_err = 1;
+	if (write_err)
+		error("cannot feed the input to external filter %s", cmd);
+
+	status = finish_command(&child_process);
+	if (status)
+		error("external filter %s failed %d", cmd, -status);
+	return (write_err || status);
+}
+
+static char *apply_filter(const char *path, const char *src,
+			  unsigned long *sizep, const char *cmd)
+{
+	/*
+	 * Create a pipeline to have the command filter the buffer's
+	 * contents.
+	 *
+	 * (child --> cmd) --> us
+	 */
+	const int SLOP = 4096;
+	int pipe_feed[2];
+	int status;
+	char *dst;
+	unsigned long dstsize, dstalloc;
+	struct child_process child_process;
+
+	if (!cmd)
+		return NULL;
+
+	memset(&child_process, 0, sizeof(child_process));
+
+	if (pipe(pipe_feed) < 0) {
+		error("cannot create pipe to run external filter %s", cmd);
+		return NULL;
+	}
+
+	child_process.pid = fork();
+	if (child_process.pid < 0) {
+		error("cannot fork to run external filter %s", cmd);
+		close(pipe_feed[0]);
+		close(pipe_feed[1]);
+		return NULL;
+	}
+	if (!child_process.pid) {
+		dup2(pipe_feed[1], 1);
+		close(pipe_feed[0]);
+		close(pipe_feed[1]);
+		exit(filter_buffer(path, src, *sizep, cmd));
+	}
+	close(pipe_feed[1]);
+
+	dstalloc = *sizep;
+	dst = xmalloc(dstalloc);
+	dstsize = 0;
+
+	while (1) {
+		ssize_t numread = xread(pipe_feed[0], dst + dstsize,
+					dstalloc - dstsize);
+
+		if (numread <= 0) {
+			if (!numread)
+				break;
+			error("read from external filter %s failed", cmd);
+			free(dst);
+			dst = NULL;
+			break;
+		}
+		dstsize += numread;
+		if (dstalloc <= dstsize + SLOP) {
+			dstalloc = dstsize + SLOP;
+			dst = xrealloc(dst, dstalloc);
+		}
+	}
+
+	status = finish_command(&child_process);
+	if (status) {
+		error("external filter %s failed %d", cmd, -status);
+		free(dst);
+		dst = NULL;
+	}
+
+	if (dst)
+		*sizep = dstsize;
+	return dst;
+}
+
+static struct convert_driver {
+	const char *name;
+	struct convert_driver *next;
+	char *smudge;
+	char *clean;
+} *user_convert, **user_convert_tail;
+
+static int read_convert_config(const char *var, const char *value)
+{
+	const char *ep, *name;
+	int namelen;
+	struct convert_driver *drv;
+
+	/*
+	 * External conversion drivers are configured using
+	 * "filter.<name>.variable".
+	 */
+	if (prefixcmp(var, "filter.") || (ep = strrchr(var, '.')) == var + 6)
+		return 0;
+	name = var + 7;
+	namelen = ep - name;
+	for (drv = user_convert; drv; drv = drv->next)
+		if (!strncmp(drv->name, name, namelen) && !drv->name[namelen])
+			break;
+	if (!drv) {
+		char *namebuf;
+		drv = xcalloc(1, sizeof(struct convert_driver));
+		namebuf = xmalloc(namelen + 1);
+		memcpy(namebuf, name, namelen);
+		namebuf[namelen] = 0;
+		drv->name = namebuf;
+		drv->next = NULL;
+		*user_convert_tail = drv;
+		user_convert_tail = &(drv->next);
+	}
+
+	ep++;
+
+	/*
+	 * filter.<name>.smudge and filter.<name>.clean specifies
+	 * the command line:
+	 *
+	 *	command-line
+	 *
+	 * The command-line will not be interpolated in any way.
+	 */
+
+	if (!strcmp("smudge", ep)) {
+		if (!value)
+			return error("%s: lacks value", var);
+		drv->smudge = strdup(value);
+		return 0;
+	}
+
+	if (!strcmp("clean", ep)) {
+		if (!value)
+			return error("%s: lacks value", var);
+		drv->clean = strdup(value);
+		return 0;
+	}
+	return 0;
+}
+
 static void setup_convert_check(struct git_attr_check *check)
 {
 	static struct git_attr *attr_crlf;
+	static struct git_attr *attr_filter;
 
-	if (!attr_crlf)
+	if (!attr_crlf) {
 		attr_crlf = git_attr("crlf", 4);
-	check->attr = attr_crlf;
+		attr_filter = git_attr("filter", 6);
+		user_convert_tail = &user_convert;
+		git_config(read_convert_config);
+	}
+	check[0].attr = attr_crlf;
+	check[1].attr = attr_filter;
 }
 
 static int git_path_check_crlf(const char *path, struct git_attr_check *check)
 {
-	const char *value = check->value;
+	const char *value = check[0].value;
 
 	if (ATTR_TRUE(value))
 		return CRLF_TEXT;
@@ -224,26 +415,70 @@ static int git_path_check_crlf(const char *path, struct git_attr_check *check)
 	return CRLF_GUESS;
 }
 
+static struct convert_driver *git_path_check_convert(const char *path,
+					     struct git_attr_check *check)
+{
+	const char *value = check[1].value;
+	struct convert_driver *drv;
+
+	if (ATTR_TRUE(value) || ATTR_FALSE(value) || ATTR_UNSET(value))
+		return NULL;
+	for (drv = user_convert; drv; drv = drv->next)
+		if (!strcmp(value, drv->name))
+			return drv;
+	return NULL;
+}
+
 char *convert_to_git(const char *path, const char *src, unsigned long *sizep)
 {
-	struct git_attr_check check[1];
+	struct git_attr_check check[2];
 	int crlf = CRLF_GUESS;
+	char *filter = NULL;
+	char *buf, *buf2;
 
 	setup_convert_check(check);
-	if (!git_checkattr(path, 1, check)) {
+	if (!git_checkattr(path, 2, check)) {
+		struct convert_driver *drv;
 		crlf = git_path_check_crlf(path, check);
+		drv = git_path_check_convert(path, check);
+		if (drv && drv->clean)
+			filter = drv->clean;
 	}
-	return crlf_to_git(path, src, sizep, crlf);
+
+	buf = apply_filter(path, src, sizep, filter);
+
+	buf2 = crlf_to_git(path, buf ? buf : src, sizep, crlf);
+	if (buf2) {
+		free(buf);
+		buf = buf2;
+	}
+
+	return buf;
 }
 
 char *convert_to_working_tree(const char *path, const char *src, unsigned long *sizep)
 {
-	struct git_attr_check check[1];
+	struct git_attr_check check[2];
 	int crlf = CRLF_GUESS;
+	char *filter = NULL;
+	char *buf, *buf2;
 
 	setup_convert_check(check);
-	if (!git_checkattr(path, 1, check)) {
+	if (!git_checkattr(path, 2, check)) {
+		struct convert_driver *drv;
 		crlf = git_path_check_crlf(path, check);
+		drv = git_path_check_convert(path, check);
+		if (drv && drv->smudge)
+			filter = drv->smudge;
 	}
-	return crlf_to_worktree(path, src, sizep, crlf);
+
+	buf = crlf_to_worktree(path, src, sizep, crlf);
+
+	buf2 = apply_filter(path, buf ? buf : src, sizep, filter);
+	if (buf2) {
+		free(buf);
+		buf = buf2;
+	}
+
+	return buf;
 }
diff --git a/t/t0021-filter.sh b/t/t0021-filter.sh
new file mode 100755
index 0000000..0f4cd05
--- /dev/null
+++ b/t/t0021-filter.sh
@@ -0,0 +1,35 @@
+#!/bin/sh
+
+test_description='external filter conversion'
+
+. ./test-lib.sh
+
+cat <<\EOF >rot13.sh
+tr '[a-zA-Z]' '[n-za-mN-ZA-M]'
+EOF
+chmod +x rot13.sh
+
+test_expect_success setup '
+	git config filter.rot13.smudge ./rot13.sh &&
+	git config filter.rot13.clean ./rot13.sh &&
+
+	echo "*.t filter=rot13" >.gitattributes &&
+
+	{
+	    echo a b c d e f g h i j k l m
+	    echo n o p q r s t u v w x y z
+	} >test &&
+	cat test >test.t &&
+	cat test >test.o &&
+	git add test test.t &&
+	git checkout -- test test.t
+'
+
+test_expect_success check '
+
+	cmp test.o test &&
+	cmp test.o test.t
+
+'
+
+test_done
-- 
1.5.1.1.948.g9f6f

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH 0/4] External 'filter' attributes and drivers
  2007-04-21 10:40 [PATCH 0/4] External 'filter' attributes and drivers Junio C Hamano
                   ` (4 preceding siblings ...)
  2007-04-21 10:40 ` [PATCH 4/4] Add 'filter' attribute and external filter driver definition Junio C Hamano
@ 2007-04-21 20:03 ` Alex Riesen
  2007-04-22  1:19 ` David Lang
  2007-04-22  5:20 ` Shawn O. Pearce
  7 siblings, 0 replies; 22+ messages in thread
From: Alex Riesen @ 2007-04-21 20:03 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

Junio C Hamano, Sat, Apr 21, 2007 12:40:55 +0200:
> 
> [1/4] is Alex's earlier patch, rebased on top of 'next'.
> 

Oh, thanks. I was almost about to resend it.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 4/4] Add 'filter' attribute and external filter driver definition.
  2007-04-21 10:40 ` [PATCH 4/4] Add 'filter' attribute and external filter driver definition Junio C Hamano
@ 2007-04-22  0:39   ` Shawn O. Pearce
  2007-04-22  2:15     ` Junio C Hamano
  2007-04-22  1:33   ` David Lang
  2007-04-22  5:47   ` [PATCH 4/4] Add 'filter' attribute and external filter driver definition Linus Torvalds
  2 siblings, 1 reply; 22+ messages in thread
From: Shawn O. Pearce @ 2007-04-22  0:39 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

Junio C Hamano <junkio@cox.net> wrote:
> The interface is similar to the custom low-level merge drivers.
... 
> +static int filter_buffer(const char *path, const char *src,
> +			 unsigned long size, const char *cmd)
> +{
> +	/*
> +	 * Spawn cmd and feed the buffer contents through its stdin.
> +	 */
...

ick.  What about something like this on top?  I moved the extra child
process for the input pipe down into the start_command routine,
where we can do something a little smarter on some systems, like
using a thread rather than a full process.  Its also a shorter
patch and uses more of the run-command API.

Its not documented very well, but if you set child_process.{in,out}
to <0 we open the pipe for you, and close your side of the pipe
for you.  That really simplifies the logic in the callers.

I did consider rewriting this as a select() based loop to feed the
input and read the output, but that might not port well onto a more
native Win32 based set of APIs.

---
diff --git a/convert.c b/convert.c
index 62d8cee..2ba7ea3 100644
--- a/convert.c
+++ b/convert.c
@@ -201,99 +201,37 @@ static char *crlf_to_worktree(const char *path, const char *src, unsigned long *
 	return buffer;
 }
 
-static int filter_buffer(const char *path, const char *src,
-			 unsigned long size, const char *cmd)
-{
-	/*
-	 * Spawn cmd and feed the buffer contents through its stdin.
-	 */
-	struct child_process child_process;
-	int pipe_feed[2];
-	int write_err, status;
-
-	memset(&child_process, 0, sizeof(child_process));
-
-	if (pipe(pipe_feed) < 0) {
-		error("cannot create pipe to run external filter %s", cmd);
-		return 1;
-	}
-
-	child_process.pid = fork();
-	if (child_process.pid < 0) {
-		error("cannot fork to run external filter %s", cmd);
-		close(pipe_feed[0]);
-		close(pipe_feed[1]);
-		return 1;
-	}
-	if (!child_process.pid) {
-		dup2(pipe_feed[0], 0);
-		close(pipe_feed[0]);
-		close(pipe_feed[1]);
-		execlp(cmd, cmd, NULL);
-		return 1;
-	}
-	close(pipe_feed[0]);
-
-	write_err = (write_in_full(pipe_feed[1], src, size) < 0);
-	if (close(pipe_feed[1]))
-		write_err = 1;
-	if (write_err)
-		error("cannot feed the input to external filter %s", cmd);
-
-	status = finish_command(&child_process);
-	if (status)
-		error("external filter %s failed %d", cmd, -status);
-	return (write_err || status);
-}
-
 static char *apply_filter(const char *path, const char *src,
 			  unsigned long *sizep, const char *cmd)
 {
-	/*
-	 * Create a pipeline to have the command filter the buffer's
-	 * contents.
-	 *
-	 * (child --> cmd) --> us
-	 */
 	const int SLOP = 4096;
-	int pipe_feed[2];
+	struct child_process filter_process;
+	const char *filter_argv[] = {cmd, NULL};
 	int status;
 	char *dst;
 	unsigned long dstsize, dstalloc;
-	struct child_process child_process;
 
 	if (!cmd)
 		return NULL;
 
-	memset(&child_process, 0, sizeof(child_process));
-
-	if (pipe(pipe_feed) < 0) {
-		error("cannot create pipe to run external filter %s", cmd);
-		return NULL;
-	}
-
-	child_process.pid = fork();
-	if (child_process.pid < 0) {
-		error("cannot fork to run external filter %s", cmd);
-		close(pipe_feed[0]);
-		close(pipe_feed[1]);
+	memset(&filter_process, 0, sizeof(filter_process));
+	filter_process.in_bufptr = src;
+	filter_process.in_buflen = *sizep;
+	filter_process.out = -1;
+	filter_process.argv = filter_argv;
+	status = start_command(&filter_process);
+	if (status) {
+		error("external filter %s failed %d", cmd, -status);
 		return NULL;
 	}
-	if (!child_process.pid) {
-		dup2(pipe_feed[1], 1);
-		close(pipe_feed[0]);
-		close(pipe_feed[1]);
-		exit(filter_buffer(path, src, *sizep, cmd));
-	}
-	close(pipe_feed[1]);
 
 	dstalloc = *sizep;
 	dst = xmalloc(dstalloc);
 	dstsize = 0;
 
 	while (1) {
-		ssize_t numread = xread(pipe_feed[0], dst + dstsize,
-					dstalloc - dstsize);
+		ssize_t numread = xread(filter_process.out,
+			dst + dstsize, dstalloc - dstsize);
 
 		if (numread <= 0) {
 			if (!numread)
@@ -310,7 +248,7 @@ static char *apply_filter(const char *path, const char *src,
 		}
 	}
 
-	status = finish_command(&child_process);
+	status = finish_command(&filter_process);
 	if (status) {
 		error("external filter %s failed %d", cmd, -status);
 		free(dst);
diff --git a/run-command.c b/run-command.c
index eff523e..72887f8 100644
--- a/run-command.c
+++ b/run-command.c
@@ -20,12 +20,29 @@ int start_command(struct child_process *cmd)
 	int need_in, need_out;
 	int fdin[2], fdout[2];
 
-	need_in = !cmd->no_stdin && cmd->in < 0;
+	need_in = !cmd->no_stdin && (cmd->in_bufptr || cmd->in < 0);
 	if (need_in) {
 		if (pipe(fdin) < 0)
 			return -ERR_RUN_COMMAND_PIPE;
-		cmd->in = fdin[1];
-		cmd->close_in = 1;
+		if (cmd->in_bufptr) {
+			pid_t in_feeder = fork();
+			if (in_feeder < 0) {
+				close_pair(fdin);
+				return -ERR_RUN_COMMAND_PIPE;
+			}
+			if (!in_feeder) {
+				close(fdin[0]);
+				if (write_in_full(fdin[1],
+					cmd->in_bufptr,
+					cmd->in_buflen) != cmd->in_buflen)
+					die("'%s' did not read all input", cmd->argv[0]);
+				exit(0);
+			}
+			close(fdin[1]);
+		} else {
+			cmd->in = fdin[1];
+			cmd->close_in = 1;
+		}
 	}
 
 	need_out = !cmd->no_stdout
diff --git a/run-command.h b/run-command.h
index 3680ef9..7632843 100644
--- a/run-command.h
+++ b/run-command.h
@@ -13,6 +13,8 @@ enum {
 
 struct child_process {
 	const char **argv;
+	const char *in_bufptr;
+	size_t in_buflen;
 	pid_t pid;
 	int in;
 	int out;
-- 
1.5.1.1.135.gf948


-- 
Shawn.

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH 0/4] External 'filter' attributes and drivers
  2007-04-21 10:40 [PATCH 0/4] External 'filter' attributes and drivers Junio C Hamano
                   ` (5 preceding siblings ...)
  2007-04-21 20:03 ` [PATCH 0/4] External 'filter' attributes and drivers Alex Riesen
@ 2007-04-22  1:19 ` David Lang
  2007-04-22  5:20 ` Shawn O. Pearce
  7 siblings, 0 replies; 22+ messages in thread
From: David Lang @ 2007-04-22  1:19 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Sat, 21 Apr 2007, Junio C Hamano wrote:

> I know this is controversial, but here is a small four patch
> series to let you insert arbitrary external filter in checkin
> and checkout codepath.
>
> [PATCH 1/4] Simplify calling of CR/LF conversion routines
> [PATCH 2/4] convert.c: restructure the attribute checking part.
> [PATCH 3/4] lockfile: record the primary process.
> [PATCH 4/4] Add 'filter' attribute and external filter driver definition.
>
>
> [1/4] is Alex's earlier patch, rebased on top of 'next'.
>
> [3/4] is necessary for the series because otherwise 'git add'
> would not work with any external filter, but the change is
> applicable to 'master'.
>
> [4/4] is the body of the change.  I wanted to like run-command.h
> process spawning infrastructure, but I suspect I did not use it
> optimally.  People who were more involved in its evolution
> hopefully have suggestions for better use of it.

thanks for doing this.

from other discussions on the subject I was under the impression that in some 
cases git commands would look at the file in the working directory instead of 
extracing it from the object blob if the index and timestamp indicated that the 
file had not been changed.

with an external filter in use this is not a safe thing to do and that needs to 
be disabled if gitattributes specifies an external filter (and possibly if 
gitattributes specifies crlf)

this brings up a potential performance problem. if you have to look for 
.gitattributes in all parent directories for each file manipulation tha may want 
to use the working tree optimization, you can end up with a lot fo time wasted 
in just looking up non-existant files (or in finding files and parseing them). 
Apache has this problem with their .htaccess files wher eyou can get a pretty 
substantial performance boost by disabling them. it may be worth it to be able 
to not use .gitattribute files, but only use the repository-wide gitattributes 
file.

Finally, a how-to question

in my case I have two uses for this

1. a permission preserving filter.

  This records the permissions on checking, sets them on checkout

2. host specific file munging for config files

   I have a XML file that specifies what munging should be done for each host and 
a perl script that does this munging

in both of these cases I have a config for the filter (the list of permissions 
or the xml file defining the munging) that should be checked into the repository 
as part of the commit

the problems I see are

1. the config file needs to be checked out before any files that go through the 
filter so that the filter has it available to use

2. when checking in files there is a potential of having the config file change 
between one addition to the index and another. Part of this is in the "well, 
don't do that then catagory" of things, but for things like the permission 
filter this is harder to do

if there is one master permission file then each file checkin could potentially 
add/modify a line in it. as long as lines related to other files aren't changed 
you could re-add it to the index, overriding the prior version.

if you do one permission file per file you are tracking the permissions of (say 
.permissions.$file) then you don't have to worry about different versions of the 
file, but you still need to have the filter (which is running as part of a 
checking) do a checkin of another file. This approach would also make it harder 
to know which files need to be checked out before the filter can run.

any thoughts on how to deal with these issues?

David Lang

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 4/4] Add 'filter' attribute and external filter driver  definition.
  2007-04-21 10:40 ` [PATCH 4/4] Add 'filter' attribute and external filter driver definition Junio C Hamano
  2007-04-22  0:39   ` Shawn O. Pearce
@ 2007-04-22  1:33   ` David Lang
  2007-04-22  6:33     ` Junio C Hamano
  2007-04-22  5:47   ` [PATCH 4/4] Add 'filter' attribute and external filter driver definition Linus Torvalds
  2 siblings, 1 reply; 22+ messages in thread
From: David Lang @ 2007-04-22  1:33 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Sat, 21 Apr 2007, Junio C Hamano wrote:

> The interface is similar to the custom low-level merge drivers.
>
> First you configure your filter driver by defining 'filter.<name>.*'
> variables in the configuration.
>
> 	filter.<name>.clean	filter command to run upon checkin
> 	filter.<name>.smudge	filter command to run upon checkout
>
> Then you assign filter attribute to each path, whose name
> matches the custom filter driver's name.
>
> Example:
>
> 	(in .gitattributes)
> 	*.c	filter=indent
>
> 	(in config)
> 	[filter "indent"]
> 		clean = indent
> 		smudge = cat


hmm, three things come to mind here

1. it would be useful in many cases for the filter program to know what file 
it's working on (and probably some other things), so there are probably some 
command-line arguments that should be able to be passed to the filter.

2. should this be done as a modification of the in-memory buffer (s this patch 
does it?) or should it be done at the time of the read/write, makeing the filter 
be responsible for actually doing the disk I/O, which would give it the benifit 
of being able to do things like set permissions and other things that can't be 
done until the file is actually on the filesystem (for something managing config 
files, this could include restarting the daemon related to the config file for 
example)

3. why specify seperate clean/smudge programs instead of just one script with a 
read/write parameter? I suspect that in most cases the external filter program 
that cleans files will be the same one that smudges them. the clean/smudge 
version does let you specify vastly different things without requireing a 
wrapper script around them, but it would mean duplicating the line when they are 
the same.

the first two items seem fairly important to me, but the third is a niceity that 
I could live with as-is.

David Lang

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 4/4] Add 'filter' attribute and external filter driver definition.
  2007-04-22  0:39   ` Shawn O. Pearce
@ 2007-04-22  2:15     ` Junio C Hamano
  2007-04-22  3:00       ` Shawn O. Pearce
  0 siblings, 1 reply; 22+ messages in thread
From: Junio C Hamano @ 2007-04-22  2:15 UTC (permalink / raw)
  To: Shawn O. Pearce; +Cc: git

"Shawn O. Pearce" <spearce@spearce.org> writes:

> ick.  What about something like this on top?  I moved the extra child
> process for the input pipe down into the start_command routine,
> where we can do something a little smarter on some systems, like
> using a thread rather than a full process.  Its also a shorter
> patch and uses more of the run-command API.

Well, I did not like start_command() that wanted to always
perform the full exec of something else for its inflexibility,
and this piles a specific hack on top of it...  Why not a
callback with void * pointer?

Or are you trying to make this interface as inflexible and
feature-limited as possible, perhaps to make it easier to
porting to Windows?

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 4/4] Add 'filter' attribute and external filter driver definition.
  2007-04-22  2:15     ` Junio C Hamano
@ 2007-04-22  3:00       ` Shawn O. Pearce
  0 siblings, 0 replies; 22+ messages in thread
From: Shawn O. Pearce @ 2007-04-22  3:00 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

Junio C Hamano <junkio@cox.net> wrote:
> Well, I did not like start_command() that wanted to always
> perform the full exec of something else for its inflexibility,
> and this piles a specific hack on top of it...  Why not a
> callback with void * pointer?

Well, that's because its always used to execute some external
program.  And some operating system designers once upon a time
thought that was the only way anyone would ever need to start a
new parallel thread of execution.  ;-)

But why do you want a callback here in start_command() given
that all you are doing is running a filter command anyway?
Is that so you could start a "thread" to handle the stdin
pipe?

-- 
Shawn.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 0/4] External 'filter' attributes and drivers
  2007-04-21 10:40 [PATCH 0/4] External 'filter' attributes and drivers Junio C Hamano
                   ` (6 preceding siblings ...)
  2007-04-22  1:19 ` David Lang
@ 2007-04-22  5:20 ` Shawn O. Pearce
  2007-04-22  9:01   ` David Lang
  7 siblings, 1 reply; 22+ messages in thread
From: Shawn O. Pearce @ 2007-04-22  5:20 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

Junio C Hamano <junkio@cox.net> wrote:
> I know this is controversial, but here is a small four patch
> series to let you insert arbitrary external filter in checkin
> and checkout codepath.

This series is some pretty nice work.

But I really don't think we want filters.  Actually, I'm very
against them, and I'm actually very against the CRLF work that
has already been added.  Since the CRLF ship has sailed I won't
try to call it back to port.  But I don't want to see the filter
stuff raise anchor...

I've only really seen a few arguments for the filters:

1) Better compress structured content (e.g. ODF) by storing the
   ZIP as a tree, allowing normal deltification within packfiles
   to apply to the contained files.

2) Use a custom diff function on special files (e.g. ODF) as they
   are otherwise unreadable with the internal xdiff based engine.

3) Mutate content prior to extracting from the tree, e.g. printing.


Let me try to address these points.

#1: There are a limited number of content formats that we could
reasonably filter into the repository such that the standard
deltification routines will have good space/performance benefits.
Most of them today are ZIP archives (e.g. ODF, JAR).

Why don't we just teach the packfile format how to better compress
these types of streams?  Let read_sha1_file() and pack-objects do all
of the heavy translation work, just as they do today for text files.
Explode them into a "tree-like" thing that allows deltification
against any other content (even cross ZIP streams) just like we
do with trees, but always expose them to the working directory
level of the system as blobs.

This way we never get into the mess that David Lang pointed out
where we have many optimizations that reuse working tree files when
stat data matches; nor do we have to worry about major structural
differences between the working tree (1 file) and the repository
format (exploded ZIP as 10,000 files).


#2: We already support using any diff tool you want: set the
GIT_EXTERNAL_DIFF environment variable before running a program that
generates a diff.  As Junio pointed out on #git tonight, that could
be any shell script that decides how to produce the diff based on
its own logic.  Though we could also use the new attribute stuff
to select diff programs, much like we do now for merge conflict
resolution in merge-recursive.


#3: This has already been discussed at length on the list.
Letting the build system perform this sort of work is better than
making the VCS do it; especially when you want the VCS to do its
sole job well (track the state of the working directory) and the
build system do its sole job well (produce files suitable for use
outside of the repository).


So despite the fact that I tried to make 4/4 shorter, I really
don't think we should be doing this...

-- 
Shawn.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 4/4] Add 'filter' attribute and external filter driver definition.
  2007-04-21 10:40 ` [PATCH 4/4] Add 'filter' attribute and external filter driver definition Junio C Hamano
  2007-04-22  0:39   ` Shawn O. Pearce
  2007-04-22  1:33   ` David Lang
@ 2007-04-22  5:47   ` Linus Torvalds
  2007-04-22  6:12     ` Junio C Hamano
  2 siblings, 1 reply; 22+ messages in thread
From: Linus Torvalds @ 2007-04-22  5:47 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git



On Sat, 21 Apr 2007, Junio C Hamano wrote:
>
> The interface is similar to the custom low-level merge drivers.
> 
> First you configure your filter driver by defining 'filter.<name>.*'
> variables in the configuration.
> 
> 	filter.<name>.clean	filter command to run upon checkin
> 	filter.<name>.smudge	filter command to run upon checkout

I have to say, I'm obviously not a huge fan of playing games, but the 
diffs are very clean.

Are they actually *useful?* I dunno. I'm a bit nervous about what this 
means for any actual user of the feature, but I have to admit to being 
charmed by a clean implementation. 

I suspect that this gets some complaining off our back, but I *also* 
suspect that people will actually end up really screwing themselves with 
something like this and then blaming us and causing a huge pain down the 
line when we've supported this and people want "extended semantics" that 
are no longer clean.

But I'm not sure how valid an argument that really is. I do happen to 
believe in the "give them rope" philosophy. I think you can probably screw 
yourself royally with this, but hey, anybody who does that only has 
himself to blame ...

		Linus

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 4/4] Add 'filter' attribute and external filter driver definition.
  2007-04-22  5:47   ` [PATCH 4/4] Add 'filter' attribute and external filter driver definition Linus Torvalds
@ 2007-04-22  6:12     ` Junio C Hamano
  0 siblings, 0 replies; 22+ messages in thread
From: Junio C Hamano @ 2007-04-22  6:12 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: git

Linus Torvalds <torvalds@linux-foundation.org> writes:

> I suspect that this gets some complaining off our back, but I *also* 
> suspect that people will actually end up really screwing themselves with 
> something like this and then blaming us and causing a huge pain down the 
> line when we've supported this and people want "extended semantics" that 
> are no longer clean.
>
> But I'm not sure how valid an argument that really is. I do happen to 
> believe in the "give them rope" philosophy. I think you can probably screw 
> yourself royally with this, but hey, anybody who does that only has 
> himself to blame ...

That's exactly my argument when I had an exchange with Nico on
this thread.  I haven't decided if I want to make them (I sent
out a replacement patch for the tip one, and added another one)
merged in 'next' yet for that exact reason.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 4/4] Add 'filter' attribute and external filter driver  definition.
  2007-04-22  1:33   ` David Lang
@ 2007-04-22  6:33     ` Junio C Hamano
  2007-04-22  9:09       ` David Lang
  0 siblings, 1 reply; 22+ messages in thread
From: Junio C Hamano @ 2007-04-22  6:33 UTC (permalink / raw)
  To: David Lang; +Cc: git

David Lang <david.lang@digitalinsight.com> writes:

> 1. it would be useful in many cases for the filter program to know
> what file it's working on (and probably some other things), so there
> are probably some command-line arguments that should be able to be
> passed to the filter.

I can see that you missed the class when Linus talked about how
messy things would get once you allow the conversion to be
stateful.  I was in the class and remembered it ;-)

Although I initially considered interpolating "%P" with
pathname, I ended up deciding against it, to discourage people
from abusing the filter for stateful conversion that changes the
results depending on time, pathname, commit, branch and stuff.

> 2. should this be done as a modification of the in-memory buffer (s
> this patch does it?) or should it be done at the time of the
> read/write, makeing the filter be responsible for actually doing the
> disk I/O, which would give it the benifit of being able to do things
> like set permissions and other things ...

The conversion is not about overriding the mode bits recorded in
tree objects, nor making git as a replacement for build procedure.

> 3. why specify seperate clean/smudge programs instead of just one
> script with a read/write parameter?

I think the most common two ways have clean as a cleaner and
smudge as a no-op (similar to crlf=input conversion), or clean
and smudge are inverse operations (similar to crlf=true
conversion.  I do not see a sane case where clean and smudge are
the same, unless you are thinking about the toy demonstration
test piece I added to t0021 which uses rot13 as both clean and
smudge filters.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 0/4] External 'filter' attributes and drivers
  2007-04-22  5:20 ` Shawn O. Pearce
@ 2007-04-22  9:01   ` David Lang
  0 siblings, 0 replies; 22+ messages in thread
From: David Lang @ 2007-04-22  9:01 UTC (permalink / raw)
  To: Shawn O. Pearce; +Cc: Junio C Hamano, git

On Sun, 22 Apr 2007, Shawn O. Pearce wrote:

> Junio C Hamano <junkio@cox.net> wrote:
>> I know this is controversial, but here is a small four patch
>> series to let you insert arbitrary external filter in checkin
>> and checkout codepath.
>
> This series is some pretty nice work.
>
> But I really don't think we want filters.  Actually, I'm very
> against them, and I'm actually very against the CRLF work that
> has already been added.  Since the CRLF ship has sailed I won't
> try to call it back to port.  But I don't want to see the filter
> stuff raise anchor...
>
> I've only really seen a few arguments for the filters:
>
> 1) Better compress structured content (e.g. ODF) by storing the
>   ZIP as a tree, allowing normal deltification within packfiles
>   to apply to the contained files.
>
> 2) Use a custom diff function on special files (e.g. ODF) as they
>   are otherwise unreadable with the internal xdiff based engine.
>
> 3) Mutate content prior to extracting from the tree, e.g. printing.
>
>
> Let me try to address these points.
>
> #1: There are a limited number of content formats that we could
> reasonably filter into the repository such that the standard
> deltification routines will have good space/performance benefits.
> Most of them today are ZIP archives (e.g. ODF, JAR).

I think that there are a lot more that could benifit from application specific 
knowledge. git today just deals with line-based formats. anything that has more 
structure to it could benifit from a diff/merge/delta/etc specific to it

> Why don't we just teach the packfile format how to better compress
> these types of streams?  Let read_sha1_file() and pack-objects do all
> of the heavy translation work, just as they do today for text files.
> Explode them into a "tree-like" thing that allows deltification
> against any other content (even cross ZIP streams) just like we
> do with trees, but always expose them to the working directory
> level of the system as blobs.
>
> This way we never get into the mess that David Lang pointed out
> where we have many optimizations that reuse working tree files when
> stat data matches; nor do we have to worry about major structural
> differences between the working tree (1 file) and the repository
> format (exploded ZIP as 10,000 files).
>

nobody is suggesting that the working tree format be one file and the repository 
format be 1000 files. but if you think that skipping borrowing from a working 
tree is expensive, consider how expsndive it is to uncompress and recompress 
files.

>
> #2: We already support using any diff tool you want: set the
> GIT_EXTERNAL_DIFF environment variable before running a program that
> generates a diff.  As Junio pointed out on #git tonight, that could
> be any shell script that decides how to produce the diff based on
> its own logic.  Though we could also use the new attribute stuff
> to select diff programs, much like we do now for merge conflict
> resolution in merge-recursive.

but not diff tools per file, only per repository. not all files in a repository 
should be handled the same way.

>
> #3: This has already been discussed at length on the list.
> Letting the build system perform this sort of work is better than
> making the VCS do it; especially when you want the VCS to do its
> sole job well (track the state of the working directory) and the
> build system do its sole job well (produce files suitable for use
> outside of the repository).

sometimes there isn't a build system to shove this work off to.

I'll add a few more

managing permissions on files

managing system config files (where you want many systems to share what's 
logicly the same config, but each system needs it's own specifics)

David Lang

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 4/4] Add 'filter' attribute and external filter driver  definition.
  2007-04-22  6:33     ` Junio C Hamano
@ 2007-04-22  9:09       ` David Lang
  2007-04-22  9:20         ` David Lang
                           ` (2 more replies)
  0 siblings, 3 replies; 22+ messages in thread
From: David Lang @ 2007-04-22  9:09 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Sat, 21 Apr 2007, Junio C Hamano wrote:

> David Lang <david.lang@digitalinsight.com> writes:
>
>> 1. it would be useful in many cases for the filter program to know
>> what file it's working on (and probably some other things), so there
>> are probably some command-line arguments that should be able to be
>> passed to the filter.
>
> I can see that you missed the class when Linus talked about how
> messy things would get once you allow the conversion to be
> stateful.  I was in the class and remembered it ;-)
>
> Although I initially considered interpolating "%P" with
> pathname, I ended up deciding against it, to discourage people
> from abusing the filter for stateful conversion that changes the
> results depending on time, pathname, commit, branch and stuff.

I didn't miss it, I just don't think that the path in the repository is 
nessasarily as dangerous as the other things (time, branch, etc)

one thing that was listed as a possibilty was to use the sha1 of the file, but 
you would force the filter to calculate that itself. it's already available when 
extracting and recalcuating it is a waste

>> 2. should this be done as a modification of the in-memory buffer (s
>> this patch does it?) or should it be done at the time of the
>> read/write, makeing the filter be responsible for actually doing the
>> disk I/O, which would give it the benifit of being able to do things
>> like set permissions and other things ...
>
> The conversion is not about overriding the mode bits recorded in
> tree objects, nor making git as a replacement for build procedure.

what build procedures?

I'm talking about doing things like managing files in /etc

git doesn't have all the hooks to be able to set the permissions when you 
extract a file, but if the filters were actual readers/writers instead of 
in-memory operators, this becomes trivial to implement with no further changes 
to git itself

>> 3. why specify seperate clean/smudge programs instead of just one
>> script with a read/write parameter?
>
> I think the most common two ways have clean as a cleaner and
> smudge as a no-op (similar to crlf=input conversion), or clean
> and smudge are inverse operations (similar to crlf=true
> conversion.  I do not see a sane case where clean and smudge are
> the same, unless you are thinking about the toy demonstration
> test piece I added to t0021 which uses rot13 as both clean and
> smudge filters.

actually, I'm thinking of much more complicated filters, where it's easier to 
have one program do both functions then it is to have two seperate programs 
(like tar -c /tar -x)

David Lang

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 4/4] Add 'filter' attribute and external filter driver  definition.
  2007-04-22  9:09       ` David Lang
@ 2007-04-22  9:20         ` David Lang
  2007-04-22 17:42         ` Junio C Hamano
  2007-04-22 18:11         ` Nicolas Pitre
  2 siblings, 0 replies; 22+ messages in thread
From: David Lang @ 2007-04-22  9:20 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Sun, 22 Apr 2007, David Lang wrote:

> On Sat, 21 Apr 2007, Junio C Hamano wrote:
>
>> David Lang <david.lang@digitalinsight.com> writes:
>> 
>>> 1. it would be useful in many cases for the filter program to know
>>> what file it's working on (and probably some other things), so there
>>> are probably some command-line arguments that should be able to be
>>> passed to the filter.
>> 
>> I can see that you missed the class when Linus talked about how
>> messy things would get once you allow the conversion to be
>> stateful.  I was in the class and remembered it ;-)
>> 
>> Although I initially considered interpolating "%P" with
>> pathname, I ended up deciding against it, to discourage people
>> from abusing the filter for stateful conversion that changes the
>> results depending on time, pathname, commit, branch and stuff.
>
> I didn't miss it, I just don't think that the path in the repository is 
> nessasarily as dangerous as the other things (time, branch, etc)

to clarify a bit more. I have a perl program that I can point at the 
'interesting' files on my systems and have it create a 'generic' version of that 
file. I can then take that generic version of the file to any machine in the 
cluster and with the same program create a version of that generic file that's 
correct for the other system. however to know which substatutions are 
appropriate to do for the file, it needs to know the filename (well, I guess I 
could create a whole bunch of seperate config files, and then define all the 
files with different filters, each filter including the config file to use fo 
rthat specific file, but this seems like a really ugly way to do it)

> one thing that was listed as a possibilty was to use the sha1 of the file, 
> but you would force the filter to calculate that itself. it's already 
> available when extracting and recalcuating it is a waste

ignore this comment, I see you posted an example of this.

David Lang

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 4/4] Add 'filter' attribute and external filter driver  definition.
  2007-04-22  9:09       ` David Lang
  2007-04-22  9:20         ` David Lang
@ 2007-04-22 17:42         ` Junio C Hamano
  2007-04-22 21:05           ` David Lang
  2007-04-22 18:11         ` Nicolas Pitre
  2 siblings, 1 reply; 22+ messages in thread
From: Junio C Hamano @ 2007-04-22 17:42 UTC (permalink / raw)
  To: David Lang; +Cc: git

David Lang <david.lang@digitalinsight.com> writes:

>> The conversion is not about overriding the mode bits recorded in
>> tree objects, nor making git as a replacement for build procedure.
>
> what build procedures?
>
> I'm talking about doing things like managing files in /etc

That's exactly what "build procedure" is about.  Nobody sane
should be managing /etc files *directly* under any SCM.  A saner
practice is to have a separate source area with Makefile to
regenerate /etc files, so that (1) "make" manages the target
host specific customizations, (2) "make diff" can be used to
compare and sanity check the result of "make" against what is
currently installed under /etc, and (3) "make install" manages
the permission bits.  The same applies to dotfiles under $HOME/.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 4/4] Add 'filter' attribute and external filter driver definition.
  2007-04-22  9:09       ` David Lang
  2007-04-22  9:20         ` David Lang
  2007-04-22 17:42         ` Junio C Hamano
@ 2007-04-22 18:11         ` Nicolas Pitre
  2007-04-22 20:27           ` [PATCH 4/4] Add 'filter' attribute and external filter driverdefinition David Lang
  2 siblings, 1 reply; 22+ messages in thread
From: Nicolas Pitre @ 2007-04-22 18:11 UTC (permalink / raw)
  To: David Lang; +Cc: Junio C Hamano, git

On Sun, 22 Apr 2007, David Lang wrote:

> On Sat, 21 Apr 2007, Junio C Hamano wrote:
> 
> > > 3. why specify seperate clean/smudge programs instead of just one
> > > script with a read/write parameter?
> > 
> > I think the most common two ways have clean as a cleaner and
> > smudge as a no-op (similar to crlf=input conversion), or clean
> > and smudge are inverse operations (similar to crlf=true
> > conversion.  I do not see a sane case where clean and smudge are
> > the same, unless you are thinking about the toy demonstration
> > test piece I added to t0021 which uses rot13 as both clean and
> > smudge filters.
> 
> actually, I'm thinking of much more complicated filters, where it's easier to
> have one program do both functions then it is to have two seperate programs
> (like tar -c /tar -x)

Just specify the same program in both entries with the appropriate 
parameter and be happy.

It is much easier to have two entries with the same program than having 
only one entry when you actually have two separate programs.


Nicolas

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 4/4] Add 'filter' attribute and external filter  driverdefinition.
  2007-04-22 18:11         ` Nicolas Pitre
@ 2007-04-22 20:27           ` David Lang
  0 siblings, 0 replies; 22+ messages in thread
From: David Lang @ 2007-04-22 20:27 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: Junio C Hamano, git

On Sun, 22 Apr 2007, Nicolas Pitre wrote:

> On Sun, 22 Apr 2007, David Lang wrote:
>
>> On Sat, 21 Apr 2007, Junio C Hamano wrote:
>>
>>>> 3. why specify seperate clean/smudge programs instead of just one
>>>> script with a read/write parameter?
>>>
>>> I think the most common two ways have clean as a cleaner and
>>> smudge as a no-op (similar to crlf=input conversion), or clean
>>> and smudge are inverse operations (similar to crlf=true
>>> conversion.  I do not see a sane case where clean and smudge are
>>> the same, unless you are thinking about the toy demonstration
>>> test piece I added to t0021 which uses rot13 as both clean and
>>> smudge filters.
>>
>> actually, I'm thinking of much more complicated filters, where it's easier to
>> have one program do both functions then it is to have two seperate programs
>> (like tar -c /tar -x)
>
> Just specify the same program in both entries with the appropriate
> parameter and be happy.
>
> It is much easier to have two entries with the same program than having
> only one entry when you actually have two separate programs.

agreed, this is why I said that this was a fairly minor thing.

David Lang

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 4/4] Add 'filter' attribute and external filter driver  definition.
  2007-04-22 17:42         ` Junio C Hamano
@ 2007-04-22 21:05           ` David Lang
  0 siblings, 0 replies; 22+ messages in thread
From: David Lang @ 2007-04-22 21:05 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: git

On Sun, 22 Apr 2007, Junio C Hamano wrote:

> David Lang <david.lang@digitalinsight.com> writes:
>
>>> The conversion is not about overriding the mode bits recorded in
>>> tree objects, nor making git as a replacement for build procedure.
>>
>> what build procedures?
>>
>> I'm talking about doing things like managing files in /etc
>
> That's exactly what "build procedure" is about.  Nobody sane
> should be managing /etc files *directly* under any SCM.  A saner
> practice is to have a separate source area with Makefile to
> regenerate /etc files, so that (1) "make" manages the target
> host specific customizations, (2) "make diff" can be used to
> compare and sanity check the result of "make" against what is
> currently installed under /etc, and (3) "make install" manages
> the permission bits.  The same applies to dotfiles under $HOME/.

this I disagree with this, simply becouse it discourages people from useing a 
SCM for these things due to all the other work that needs to be done

doing it this way you need to
create a new work area
create a build system
find all the tools that change the files and change them to work on the files in 
the work area (and note that some of these tools are distro provided that you 
will have to fork, replace, or live without)
change all sysadmin work habits to use the new work area
not to mention, pick up the pieces when someone/something makes changes to the 
files directly.

by comparison, working in /etc directly all you need to do is to train the 
sysadmins to do a commit after they make changes, and how to revert to the 
prior version if things break (and then train the senior sysadmins to do the 
checkouts of specific versions, resolve merges, and other 'advanced' things to 
pickup the pices when things get messed up)

in fact, you can even get a lot of utility out of a daily cron job to do a 
commit and teaching a few people about git while the remainder don't even know 
that it's running.

everything above is just dealing with the simple case of managing the config 
files on a single system

now, I agree that there are advantages to not working directly in /etc and 
instead working in a seperate area with a build system to actually modify the 
files, but getting to that point is a lot of work, and getting everyone 
(including management) convinced that it's worth that much up-front effort for 
the long-term benifits is hard. and if the sysadmins are under water currently, 
it won't happen becouse they can't let other things fall apart while they go and 
build this new system.

I think this is a case of perfect being the enemy of better.

the simple case of handling files in /etc/directly is so non-intrusive that 
distros could ship it with a nightly cron job enabled by default (ideally the 
cron job would detect that no changes were made and not do a commit) and once 
you have a SCM in place managing the files by default it's much more likely that 
sysadmin tools will start to be changed to work with it directly.

This should not be a huge change for git to suppor this either (although I admit 
it is a change)

with filters enabled at all you need to be able to disable any optimizations 
that use the working tree instead of opeing up an object/pack

so the remaining changes are

1. define a dependancy that allows for filter config files to be extracted 
before the filter is run on checkout

2. allow the filter to do git-add of a file while it is running on git-add (or 
otherwise tell git that a file needs to be added before a commit is done). I 
don't know if this would work today, or if there are locks that would cause a 
deadlock (I suspect a deadlock from comments about git-lib needing to be made 
re-entrent)

3. move the filter from a pipe/pipe model to a pipe/file model (it's fair to 
require the filters to support - as a filename for stdin/stdout git needs this, 
it's fairly standard for unix utilities to do this)

David Lang

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2007-04-22 21:39 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-04-21 10:40 [PATCH 0/4] External 'filter' attributes and drivers Junio C Hamano
     [not found] ` <11771520591703-gi t-send-email-junkio@cox.net>
2007-04-21 10:40 ` [PATCH 1/4] Simplify calling of CR/LF conversion routines Junio C Hamano
2007-04-21 10:40 ` [PATCH 2/4] convert.c: restructure the attribute checking part Junio C Hamano
2007-04-21 10:40 ` [PATCH 3/4] lockfile: record the primary process Junio C Hamano
2007-04-21 10:40 ` [PATCH 4/4] Add 'filter' attribute and external filter driver definition Junio C Hamano
2007-04-22  0:39   ` Shawn O. Pearce
2007-04-22  2:15     ` Junio C Hamano
2007-04-22  3:00       ` Shawn O. Pearce
2007-04-22  1:33   ` David Lang
2007-04-22  6:33     ` Junio C Hamano
2007-04-22  9:09       ` David Lang
2007-04-22  9:20         ` David Lang
2007-04-22 17:42         ` Junio C Hamano
2007-04-22 21:05           ` David Lang
2007-04-22 18:11         ` Nicolas Pitre
2007-04-22 20:27           ` [PATCH 4/4] Add 'filter' attribute and external filter driverdefinition David Lang
2007-04-22  5:47   ` [PATCH 4/4] Add 'filter' attribute and external filter driver definition Linus Torvalds
2007-04-22  6:12     ` Junio C Hamano
2007-04-21 20:03 ` [PATCH 0/4] External 'filter' attributes and drivers Alex Riesen
2007-04-22  1:19 ` David Lang
2007-04-22  5:20 ` Shawn O. Pearce
2007-04-22  9:01   ` David Lang

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).