Buffer overflows

git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* Buffer overflows
@ 2007-08-30 19:26 Timo Sirainen
  2007-08-30 20:26 ` Lukas Sandström
                   ` (2 more replies)
  0 siblings, 3 replies; 26+ messages in thread
From: Timo Sirainen @ 2007-08-30 19:26 UTC (permalink / raw)
  To: git


[-- Attachment #1.1: Type: text/plain, Size: 1497 bytes --]

Looks like nothing has happened since my last mail about this
(http://marc.info/?l=git&m=117962988804430&w=2).

I sure hope no-one's using git-mailinfo to do any kind of automated mail
processing from untrusted users. Here's one way to cause it to overflow
a buffer in stack:

Subject: =?iso-8859-15?b?pKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSk
 pKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSk
 pKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSk
 pKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKSkpKQ=?=

It's not the only way. I just accidentally hit that when trying to
verify another buffer overflow. Just look for strcpy()s in the file.

Libc string handling functions are broken and should not be used for
anything. It's annoying when such a large project as Git is still
repeating the same old mistakes. I guess security doesn't matter to
anyone.

Attached once again beginnings of safer string handling functions, which
should be easy to use to replace the existing string handling code. I
even thought about creating some kind of an automated tool to do this,
but that's a bit too much trouble with no gain for myself.

Usage goes like:

STATIC_STRING(str, 1024);
sstr_append(str, "hello ");
sstr_printfa(str, "%d", 5);

struct string *str;
str = str_alloc(1024); // initial malloc size, grows when needed
str_append(str, "hello ");
str_printfa(str, "%d", 5);
str_free(&str);


[-- Attachment #1.2: git-strings.diff --]
[-- Type: text/x-patch, Size: 6100 bytes --]

diff --git a/Makefile b/Makefile
index 4eb4637..c79eced 100644
--- a/Makefile
+++ b/Makefile
@@ -135,7 +135,7 @@ uname_P := $(shell sh -c 'uname -p 2>/dev/null || echo not')
 
 # CFLAGS and LDFLAGS are for the users to override from the command line.
 
-CFLAGS = -g -O2 -Wall
+CFLAGS = -g -Wall
 LDFLAGS =
 ALL_CFLAGS = $(CFLAGS)
 ALL_LDFLAGS = $(LDFLAGS)
@@ -283,7 +283,7 @@ XDIFF_LIB=xdiff/lib.a
 LIB_H = \
 	archive.h blob.h cache.h commit.h csum-file.h delta.h grep.h \
 	diff.h object.h pack.h pkt-line.h quote.h refs.h list-objects.h sideband.h \
-	run-command.h strbuf.h tag.h tree.h git-compat-util.h revision.h \
+	run-command.h strbuf.h str.h str-static.h tag.h tree.h git-compat-util.h revision.h \
 	tree-walk.h log-tree.h dir.h path-list.h unpack-trees.h builtin.h \
 	utf8.h reflog-walk.h patch-ids.h attr.h decorate.h progress.h \
 	mailmap.h remote.h
@@ -302,7 +302,7 @@ LIB_OBJS = \
 	object.o pack-check.o pack-write.o patch-delta.o path.o pkt-line.o \
 	sideband.o reachable.o reflog-walk.o \
 	quote.o read-cache.o refs.o run-command.o dir.o object-refs.o \
-	server-info.o setup.o sha1_file.o sha1_name.o strbuf.o \
+	server-info.o setup.o sha1_file.o sha1_name.o strbuf.o str.o str-static.o \
 	tag.o tree.o usage.o config.o environment.o ctype.o copy.o \
 	revision.o pager.o tree-walk.o xdiff-interface.o \
 	write_or_die.o trace.o list-objects.o grep.o match-trees.o \
diff --git a/str-static.c b/str-static.c
new file mode 100644
index 0000000..a3f48f8
--- /dev/null
+++ b/str-static.c
@@ -0,0 +1,40 @@
+#include "str-static.h"
+
+void str_static_append(struct string_static *str, const char *cstr)
+{
+	unsigned int avail = str->size - str->len;
+	unsigned int len = strlen(cstr);
+
+	if (len >= avail) {
+		len = avail - 1;
+		str->overflowed = 1;
+	}
+	memcpy(str->buf + str->len, cstr, len);
+	str->len += len;
+	str->buf[str->len] = '\0';
+}
+
+void str_static_printfa(struct string_static *str, const char *fmt, ...)
+{
+	unsigned int avail = str->size - str->len;
+	va_list va;
+	int ret;
+
+	va_start(va, fmt);
+	ret = vsnprintf(str->buf + str->len, avail, fmt, va);
+	if (ret < (int)avail)
+		str->len += ret;
+	else {
+		str->len += avail - 1;
+		str->overflowed = 1;
+	}
+	va_end(va);
+}
+
+void str_static_truncate(struct string_static *str, unsigned int len)
+{
+	if (len >= str->size)
+		len = str->size - 1;
+	str->len = len;
+	str->buf[len] = '\0';
+}
diff --git a/str-static.h b/str-static.h
new file mode 100644
index 0000000..19cb28f
--- /dev/null
+++ b/str-static.h
@@ -0,0 +1,33 @@
+#ifndef STR_STATIC_H
+#define STR_STATIC_H
+
+#include "git-compat-util.h"
+
+struct string_static {
+	unsigned int size;
+	unsigned int len:31;
+	unsigned int overflowed:1;
+	char buf[];
+};
+
+#define STR_STATIC(name, size) \
+	union { \
+	  struct string_static string; \
+	  char string_buf[sizeof(struct string_static) + (size) + 1]; \
+	} name = { { (size)+1, 0, 0 } }
+
+extern void str_static_append(struct string_static *str, const char *cstr);
+extern void str_static_printfa(struct string_static *str, const char *fmt, ...)
+	__attribute__((format (printf, 2, 3)));
+extern void str_static_truncate(struct string_static *str, unsigned int len);
+
+#define sstr_append(str, cstr) str_static_append(&(str).string, cstr)
+#define sstr_printfa(str, fmt, ...) \
+	str_static_printfa(&(str).string, fmt, __VA_ARGS__)
+#define sstr_truncate(str, len) str_static_truncate(&(str).string, len)
+
+#define sstr_c(str) ((str).string.buf)
+#define sstr_len(str) ((str).string.len)
+#define sstr_overflowed(str) ((str).string.overflowed)
+
+#endif
diff --git a/str.c b/str.c
new file mode 100644
index 0000000..7530073
--- /dev/null
+++ b/str.c
@@ -0,0 +1,75 @@
+#include "str.h"
+#include "git-compat-util.h"
+
+extern struct string *str_alloc(unsigned int initial_size)
+{
+	struct string *str;
+
+	str = xmalloc(sizeof(*str));
+	str->len = 0;
+	str->size = initial_size + 1;
+	str->buf = xmalloc(str->size);
+	str->buf[0] = '\0';
+	return str;
+}
+
+void str_free(struct string **_str)
+{
+	struct string *str = *_str;
+
+	free(str->buf);
+	str->buf = NULL;
+	free(str);
+
+	*_str = NULL;
+}
+
+static void str_grow_if_needed(struct string *str, unsigned int len)
+{
+	unsigned int avail = str->size - str->len;
+
+	if (len >= avail) {
+		str->size = (str->len + len) * 2;
+		str->buf = xrealloc(str->buf, str->size);
+	}
+}
+
+void str_append(struct string *str, const char *cstr)
+{
+	unsigned int len = strlen(cstr);
+
+	str_grow_if_needed(str, len);
+	memcpy(str->buf + str->len, cstr, len);
+	str->len += len;
+	str->buf[str->len] = '\0';
+}
+
+void str_printfa(struct string *str, const char *fmt, ...)
+{
+	unsigned int avail = str->size - str->len;
+	va_list va, va2;
+	int ret;
+
+	va_start(va, fmt);
+	va_copy(va2, va);
+	ret = vsnprintf(str->buf + str->len, avail, fmt, va);
+	assert(ret >= 0);
+
+	if ((unsigned int)ret >= avail) {
+		str_grow_if_needed(str, ret);
+		avail = str->size - str->len;
+
+		ret = vsnprintf(str->buf + str->len, avail, fmt, va2);
+		assert(ret >= 0 && (unsigned int)ret < avail);
+	}
+	str->len += ret;
+	va_end(va);
+}
+
+void str_truncate(struct string *str, unsigned int len)
+{
+	if (len >= str->size)
+		len = str->size - 1;
+	str->len = len;
+	str->buf[len] = '\0';
+}
diff --git a/str.h b/str.h
new file mode 100644
index 0000000..329aad4
--- /dev/null
+++ b/str.h
@@ -0,0 +1,20 @@
+#ifndef STR_H
+#define STR_H
+
+struct string {
+	unsigned int len, size;
+	char *buf;
+};
+
+extern struct string *str_alloc(unsigned int initial_size);
+extern void str_free(struct string **str);
+
+extern void str_append(struct string *str, const char *cstr);
+extern void str_printfa(struct string *str, const char *fmt, ...)
+	__attribute__((format (printf, 2, 3)));
+extern void str_truncate(struct string *str, unsigned int len);
+
+#define str_c(str) ((str)->buf)
+#define str_len(str) ((str)->len)
+
+#endif

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: Buffer overflows
  2007-08-30 19:26 Buffer overflows Timo Sirainen
@ 2007-08-30 20:26 ` Lukas Sandström
  2007-08-30 20:46 ` Linus Torvalds
  2007-08-30 21:48 ` [PATCH] Temporary fix for stack smashing in mailinfo Alex Riesen
  2 siblings, 0 replies; 26+ messages in thread
From: Lukas Sandström @ 2007-08-30 20:26 UTC (permalink / raw)
  To: Timo Sirainen; +Cc: git

Timo Sirainen wrote:
> Attached once again beginnings of safer string handling functions, which
> should be easy to use to replace the existing string handling code. I
> even thought about creating some kind of an automated tool to do this,
> but that's a bit too much trouble with no gain for myself.

How about using an existing string handling library instead of
creating another one from scratch?

One library worth looking at might be "The Better String Library"[1].

It claims to be both portable, stable, secure and have high performance.

/Lukas

[1] http://bstring.sourceforge.net/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Buffer overflows
  2007-08-30 19:26 Buffer overflows Timo Sirainen
  2007-08-30 20:26 ` Lukas Sandström
@ 2007-08-30 20:46 ` Linus Torvalds
  2007-08-30 21:08   ` Timo Sirainen
  2007-08-30 21:48 ` [PATCH] Temporary fix for stack smashing in mailinfo Alex Riesen
  2 siblings, 1 reply; 26+ messages in thread
From: Linus Torvalds @ 2007-08-30 20:46 UTC (permalink / raw)
  To: Timo Sirainen; +Cc: git

On Thu, 30 Aug 2007, Timo Sirainen wrote:
>
> Looks like nothing has happened since my last mail about this
> (http://marc.info/?l=git&m=117962988804430&w=2).

Perhaps because your patch was using a totally nonstandard and slow 
interface, and had nasty string declaration issues, as people even pointed 
out to you.

If you were to send in a patch that simply just fixed some random case 
without introducing the other stuff in forms that nobody is used to, 
people would probably react more.

Especially since:

> I sure hope no-one's using git-mailinfo to do any kind of automated mail 
> processing from untrusted users.

Obviously nobody would do that. Not because of any email buffer overflows, 
but because people wouldn't want to apply untrusted patches in the first 
place!

IOW, if I don't trust the mail, I'd sure as hell not apply it - not 
because I'm afraid the email is misformed, but because I'd not trust the 
*patch*.

			Linus

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Buffer overflows
  2007-08-30 20:46 ` Linus Torvalds
@ 2007-08-30 21:08   ` Timo Sirainen
  2007-08-30 21:35     ` Reece Dunn
  2007-08-31  4:09     ` Linus Torvalds
  0 siblings, 2 replies; 26+ messages in thread
From: Timo Sirainen @ 2007-08-30 21:08 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: git

[-- Attachment #1: Type: text/plain, Size: 1765 bytes --]

On 30.8.2007, at 23.46, Linus Torvalds wrote:

> On Thu, 30 Aug 2007, Timo Sirainen wrote:
>>
>> Looks like nothing has happened since my last mail about this
>> (http://marc.info/?l=git&m=117962988804430&w=2).
>
> Perhaps because your patch was using a totally nonstandard and slow
> interface, and had nasty string declaration issues, as people even  
> pointed
> out to you.

Slow?

> If you were to send in a patch that simply just fixed some random case
> without introducing the other stuff in forms that nobody is used to,
> people would probably react more.

The problem is that the git code is full of these random cases. It's  
simply a huge job to even try to verify the correctness of it. Even  
if someone did that and fixed all the problems, tomorrow there would  
be new ones because noone bothers to even try to avoid them. So there  
really isn't any point in trying to make git secure until the coding  
style changes.

The code should be easy to verify to be secure, and with some kind of  
a safe string API it's a lot easier than trying to figure out corner  
cases where strcpy() calls break.

> Especially since:
>
>> I sure hope no-one's using git-mailinfo to do any kind of  
>> automated mail
>> processing from untrusted users.
>
> Obviously nobody would do that. Not because of any email buffer  
> overflows,
> but because people wouldn't want to apply untrusted patches in the  
> first
> place!

And anyone who uses git-mailinfo for anything else than manually  
applying trusted patches to their own tree deserve what they get, I  
suppose? For example if I decided to use it to automatically extract  
patches and their descriptions out of received emails and put them in  
a queue somewhere where I could look at them more easily.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 186 bytes --]

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Buffer overflows
  2007-08-30 21:08   ` Timo Sirainen
@ 2007-08-30 21:35     ` Reece Dunn
  2007-08-30 21:51       ` Timo Sirainen
  2007-08-30 22:14       ` Junio C Hamano
  2007-08-31  4:09     ` Linus Torvalds
  1 sibling, 2 replies; 26+ messages in thread
From: Reece Dunn @ 2007-08-30 21:35 UTC (permalink / raw)
  To: Timo Sirainen, Linus Torvalds, git

On 30/08/2007, Timo Sirainen <tss@iki.fi> wrote:
> On 30.8.2007, at 23.46, Linus Torvalds wrote:
>
> > On Thu, 30 Aug 2007, Timo Sirainen wrote:
> >>
> >> Looks like nothing has happened since my last mail about this
> >> (http://marc.info/?l=git&m=117962988804430&w=2).
>
> > If you were to send in a patch that simply just fixed some random case
> > without introducing the other stuff in forms that nobody is used to,
> > people would probably react more.
>
> The problem is that the git code is full of these random cases. It's
> simply a huge job to even try to verify the correctness of it. Even
> if someone did that and fixed all the problems, tomorrow there would
> be new ones because noone bothers to even try to avoid them. So there
> really isn't any point in trying to make git secure until the coding
> style changes.

You don't want a manual check to do these kinds of checks. Not only is
it a huge job, you have the human factor: people make mistakes. This
is (in part) what the review process is for, but understanding how to
identify code that is safe from buffer overruns, integer overflows and
the like is a complex task. Also, it may work on 32-bit which has been
verified, but not on 64-bit.

It would be far better to specify the rules on how to detect these
issues into a static analysis tool and have that do the checking for
you. Therefore, it is possible to detect when new problems have been
added into the codebase. Does sparse support identifying these issues?

> The code should be easy to verify to be secure, and with some kind of
> a safe string API it's a lot easier than trying to figure out corner
> cases where strcpy() calls break.

Why is it easier? If you have a fixed-size buffer, why not use
strncpy, which is what a safe string API is essentially doing anyway?

In this case, detecting strcpy usage can be done via grep. This is
quick, simple and easy to repeat. Other things are more complicated,
which is where automated verification tools help.

- Reece

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH] Temporary fix for stack smashing in mailinfo
  2007-08-30 19:26 Buffer overflows Timo Sirainen
  2007-08-30 20:26 ` Lukas Sandström
  2007-08-30 20:46 ` Linus Torvalds
@ 2007-08-30 21:48 ` Alex Riesen
  2007-08-30 22:53   ` Junio C Hamano
  2 siblings, 1 reply; 26+ messages in thread
From: Alex Riesen @ 2007-08-30 21:48 UTC (permalink / raw)
  To: Timo Sirainen; +Cc: git, Junio C Hamano

Signed-off-by: Alex Riesen <raa.lkml@gmail.com>
---
Timo Sirainen, Thu, Aug 30, 2007 21:26:49 +0200:
> Attached once again beginnings of safer string handling functions, which
> should be easy to use to replace the existing string handling code. ...

Lots of code. And the bug you mentioned still not fixed.
No gain for anyone. You are right, but you are also useless.

Junio, I cannot have time to fix the code nice and proper, but as
heavy user of git-am just have to have it fixed at least a like this.
And this is ugly (and definitely incomplete), everyone be warned.

Checked with valgrind, looks good (except for iconv_open reading past
one of its arguments):

==22856== Invalid read of size 4
...
==22856==  Address 0x42D3D9C is 52 bytes inside a block of size 53 alloc'd
==22856==    at 0x4021620: malloc (vg_replace_malloc.c:149)
==22856==    by 0x41AD12F: (within /lib/tls/i686/cmov/libc-2.5.so)
==22856==    by 0x41AC56A: (within /lib/tls/i686/cmov/libc-2.5.so)
==22856==    by 0x41ACC63: (within /lib/tls/i686/cmov/libc-2.5.so)
==22856==    by 0x41A552B: (within /lib/tls/i686/cmov/libc-2.5.so)
==22856==    by 0x41A4093: (within /lib/tls/i686/cmov/libc-2.5.so)
==22856==    by 0x41A3CF9: iconv_open (in /lib/tls/i686/cmov/libc-2.5.so)
==22856==    by 0x80BEFFE: reencode_string (utf8.c:317)
==22856==    by 0x806A9D4: convert_to_utf8 (builtin-mailinfo.c:543)
==22856==    by 0x806AB8F: decode_header (builtin-mailinfo.c:600)
==22856==    by 0x806AF57: check_header (builtin-mailinfo.c:308)
==22856==    by 0x806B70B: cmd_mailinfo (builtin-mailinfo.c:936)

 builtin-mailinfo.c |   79 +++++++++++++++++++++++++++++----------------------
 1 files changed, 45 insertions(+), 34 deletions(-)

diff --git a/builtin-mailinfo.c b/builtin-mailinfo.c
index b558754..57699eb 100644
--- a/builtin-mailinfo.c
+++ b/builtin-mailinfo.c
@@ -287,12 +287,12 @@ static void cleanup_space(char *buf)
 	}
 }
 
-static void decode_header(char *it);
+static void decode_header(char *it, unsigned itsize);
 static char *header[MAX_HDR_PARSED] = {
 	"From","Subject","Date",
 };
 
-static int check_header(char *line, char **hdr_data, int overwrite)
+static int check_header(char *line, unsigned linesize, char **hdr_data, int overwrite)
 {
 	int i;
 
@@ -305,7 +305,7 @@ static int check_header(char *line, char **hdr_data, int overwrite)
 			/* Unwrap inline B and Q encoding, and optionally
 			 * normalize the meta information to utf8.
 			 */
-			decode_header(line + len + 2);
+			decode_header(line + len + 2, linesize - len - 2);
 			hdr_data[i] = xmalloc(1000 * sizeof(char));
 			if (! handle_header(line, hdr_data[i], len + 2)) {
 				return 1;
@@ -316,14 +316,14 @@ static int check_header(char *line, char **hdr_data, int overwrite)
 	/* Content stuff */
 	if (!strncasecmp(line, "Content-Type", 12) &&
 		line[12] == ':' && isspace(line[12 + 1])) {
-		decode_header(line + 12 + 2);
+		decode_header(line + 12 + 2, linesize - 12 - 2);
 		if (! handle_content_type(line)) {
 			return 1;
 		}
 	}
 	if (!strncasecmp(line, "Content-Transfer-Encoding", 25) &&
 		line[25] == ':' && isspace(line[25 + 1])) {
-		decode_header(line + 25 + 2);
+		decode_header(line + 25 + 2, linesize - 25 - 2);
 		if (! handle_content_transfer_encoding(line)) {
 			return 1;
 		}
@@ -432,10 +432,15 @@ static int read_one_header_line(char *line, int sz, FILE *in)
 	return 1;
 }
 
-static int decode_q_segment(char *in, char *ot, char *ep, int rfc2047)
+static int decode_q_segment(char *in, char *ot, unsigned otsize, char *ep, int rfc2047)
 {
+	char *otend = ot + otsize;
 	int c;
 	while ((c = *in++) != 0 && (in <= ep)) {
+		if (ot == otend) {
+			*--ot = '\0';
+			return -1;
+		}
 		if (c == '=') {
 			int d = *in++;
 			if (d == '\n' || !d)
@@ -451,12 +456,17 @@ static int decode_q_segment(char *in, char *ot, char *ep, int rfc2047)
 	return 0;
 }
 
-static int decode_b_segment(char *in, char *ot, char *ep)
+static int decode_b_segment(char *in, char *ot, unsigned otsize, char *ep)
 {
 	/* Decode in..ep, possibly in-place to ot */
 	int c, pos = 0, acc = 0;
+	char *otend = ot + otsize;
 
 	while ((c = *in++) != 0 && (in <= ep)) {
+		if (ot == otend) {
+			*--ot = '\0';
+			return -1;
+		}
 		if (c == '+')
 			c = 62;
 		else if (c == '/')
@@ -518,7 +528,7 @@ static const char *guess_charset(const char *line, const char *target_charset)
 	return "latin1";
 }
 
-static void convert_to_utf8(char *line, const char *charset)
+static void convert_to_utf8(char *line, unsigned linesize, const char *charset)
 {
 	char *out;
 
@@ -534,11 +544,11 @@ static void convert_to_utf8(char *line, const char *charset)
 	if (!out)
 		die("cannot convert from %s to %s\n",
 		    charset, metainfo_charset);
-	strcpy(line, out);
+	strlcpy(line, out, linesize);
 	free(out);
 }
 
-static int decode_header_bq(char *it)
+static int decode_header_bq(char *it, unsigned itsize)
 {
 	char *in, *out, *ep, *cp, *sp;
 	char outbuf[1000];
@@ -578,56 +588,56 @@ static int decode_header_bq(char *it)
 		default:
 			return rfc2047; /* no munging */
 		case 'b':
-			sz = decode_b_segment(cp + 3, piecebuf, ep);
+			sz = decode_b_segment(cp + 3, piecebuf, sizeof(piecebuf), ep);
 			break;
 		case 'q':
-			sz = decode_q_segment(cp + 3, piecebuf, ep, 1);
+			sz = decode_q_segment(cp + 3, piecebuf, sizeof(piecebuf), ep, 1);
 			break;
 		}
 		if (sz < 0)
 			return rfc2047;
 		if (metainfo_charset)
-			convert_to_utf8(piecebuf, charset_q);
+			convert_to_utf8(piecebuf, sizeof(piecebuf), charset_q);
 		strcpy(out, piecebuf);
 		out += strlen(out);
 		in = ep + 2;
 	}
 	strcpy(out, in);
-	strcpy(it, outbuf);
+	strlcpy(it, outbuf, itsize);
 	return rfc2047;
 }
 
-static void decode_header(char *it)
+static void decode_header(char *it, unsigned itsize)
 {
 
-	if (decode_header_bq(it))
+	if (decode_header_bq(it, itsize))
 		return;
 	/* otherwise "it" is a straight copy of the input.
 	 * This can be binary guck but there is no charset specified.
 	 */
 	if (metainfo_charset)
-		convert_to_utf8(it, "");
+		convert_to_utf8(it, itsize, "");
 }
 
-static void decode_transfer_encoding(char *line)
+static void decode_transfer_encoding(char *line, unsigned linesize)
 {
 	char *ep;
 
 	switch (transfer_encoding) {
 	case TE_QP:
 		ep = line + strlen(line);
-		decode_q_segment(line, line, ep, 0);
+		decode_q_segment(line, line, linesize, ep, 0);
 		break;
 	case TE_BASE64:
 		ep = line + strlen(line);
-		decode_b_segment(line, line, ep);
+		decode_b_segment(line, line, linesize, ep);
 		break;
 	case TE_DONTCARE:
 		break;
 	}
 }
 
-static int handle_filter(char *line);
+static int handle_filter(char *line, unsigned linesize);
 
 static int find_boundary(void)
 {
@@ -655,7 +665,7 @@ again:
 					"can't recover\n");
 			exit(1);
 		}
-		handle_filter(newline);
+		handle_filter(newline, sizeof(newline));
 
 		/* skip to the next boundary */
 		if (!find_boundary())
@@ -670,7 +680,7 @@ again:
 
 	/* slurp in this section's info */
 	while (read_one_header_line(line, sizeof(line), fin))
-		check_header(line, p_hdr_data, 0);
+		check_header(line, sizeof(line), p_hdr_data, 0);
 
 	/* eat the blank line after section info */
 	return (fgets(line, sizeof(line), fin) != NULL);
@@ -709,9 +719,10 @@ static inline int patchbreak(const char *line)
 }
 
 
-static int handle_commit_msg(char *line)
+static int handle_commit_msg(char *line, unsigned linesize)
 {
 	static int still_looking = 1;
+	char *endline = line + linesize;
 
 	if (!cmitmsg)
 		return 0;
@@ -726,13 +737,13 @@ static int handle_commit_msg(char *line)
 			if (!*cp)
 				return 0;
 		}
-		if ((still_looking = check_header(cp, s_hdr_data, 0)) != 0)
+		if ((still_looking = check_header(cp, endline - cp, s_hdr_data, 0)) != 0)
 			return 0;
 	}
 
 	/* normalize the log message to UTF-8. */
 	if (metainfo_charset)
-		convert_to_utf8(line, charset);
+		convert_to_utf8(line, endline - line, charset);
 
 	if (patchbreak(line)) {
 		fclose(cmitmsg);
@@ -751,7 +762,7 @@ static int handle_patch(char *line)
 	return 0;
 }
 
-static int handle_filter(char *line)
+static int handle_filter(char *line, unsigned linesize)
 {
 	static int filter = 0;
 
@@ -760,7 +771,7 @@ static int handle_filter(char *line)
 	 */
 	switch (filter) {
 	case 0:
-		if (!handle_commit_msg(line))
+		if (!handle_commit_msg(line, linesize))
 			break;
 		filter++;
 	case 1:
@@ -792,14 +803,14 @@ static void handle_body(void)
 			/* flush any leftover */
 			if ((transfer_encoding == TE_BASE64)  &&
 			    (np != newline)) {
-				handle_filter(newline);
+				handle_filter(newline, sizeof(newline));
 			}
 			if (!handle_boundary())
 				return;
 		}
 
 		/* Unwrap transfer encoding */
-		decode_transfer_encoding(line);
+		decode_transfer_encoding(line, sizeof(line));
 
 		switch (transfer_encoding) {
 		case TE_BASE64:
@@ -808,7 +819,7 @@ static void handle_body(void)
 
 			/* binary data most likely doesn't have newlines */
 			if (message_type != TYPE_TEXT) {
-				rc = handle_filter(line);
+				rc = handle_filter(line, sizeof(newline));
 				break;
 			}
 
@@ -825,7 +836,7 @@ static void handle_body(void)
 					/* should be sitting on a new line */
 					*(++np) = 0;
 					op++;
-					rc = handle_filter(newline);
+					rc = handle_filter(newline, sizeof(newline));
 					np = newline;
 				}
 			} while (*op != 0);
@@ -835,7 +846,7 @@ static void handle_body(void)
 			break;
 		}
 		default:
-			rc = handle_filter(line);
+			rc = handle_filter(line, sizeof(newline));
 		}
 		if (rc)
 			/* nothing left to filter */
@@ -922,7 +933,7 @@ static int mailinfo(FILE *in, FILE *out, int ks, const char *encoding,
 
 	/* process the email header */
 	while (read_one_header_line(line, sizeof(line), fin))
-		check_header(line, p_hdr_data, 1);
+		check_header(line, sizeof(line), p_hdr_data, 1);
 
 	handle_body();
 	handle_info();
-- 
1.5.3.rc7.26.g5f7e4

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: Buffer overflows
  2007-08-30 21:35     ` Reece Dunn
@ 2007-08-30 21:51       ` Timo Sirainen
  2007-08-30 22:34         ` Reece Dunn
  2007-08-30 22:14       ` Junio C Hamano
  1 sibling, 1 reply; 26+ messages in thread
From: Timo Sirainen @ 2007-08-30 21:51 UTC (permalink / raw)
  To: Reece Dunn; +Cc: Linus Torvalds, git

[-- Attachment #1: Type: text/plain, Size: 2934 bytes --]

On 31.8.2007, at 0.35, Reece Dunn wrote:

>> The problem is that the git code is full of these random cases. It's
>> simply a huge job to even try to verify the correctness of it. Even
>> if someone did that and fixed all the problems, tomorrow there would
>> be new ones because noone bothers to even try to avoid them. So there
>> really isn't any point in trying to make git secure until the coding
>> style changes.
>
> You don't want a manual check to do these kinds of checks. Not only is
> it a huge job, you have the human factor: people make mistakes. This
> is (in part) what the review process is for, but understanding how to
> identify code that is safe from buffer overruns, integer overflows and
> the like is a complex task. Also, it may work on 32-bit which has been
> verified, but not on 64-bit.
>
> It would be far better to specify the rules on how to detect these
> issues into a static analysis tool and have that do the checking for
> you. Therefore, it is possible to detect when new problems have been
> added into the codebase. Does sparse support identifying these issues?

Yes, it is a complex task. But if there did exist such a static  
analyzer tool already, it would probably show that half of the strcpy 
() calls (and others) in git are currently unsafe. Wouldn't help all  
that much I think.

>> The code should be easy to verify to be secure, and with some kind of
>> a safe string API it's a lot easier than trying to figure out corner
>> cases where strcpy() calls break.
>
> Why is it easier? If you have a fixed-size buffer, why not use
> strncpy, which is what a safe string API is essentially doing anyway?

Well, strncpy() is a pretty good example actually. A lot of people  
use it wrong, because they don't realize that it doesn't necessarily  
NUL-terminate the strings. So it's another example of a bad API that  
can be easily used wrong. And besides that, it also fills the rest of  
the buffer with NULs, which is almost always pointless waste of CPU.

And why is safe string API easier to verify? Here's an example:

// see how easily you can use strncpy() to cause a buffer overflow:
char buf[1024];
strncpy(buf, input, 2048);

// see how impossible it is to cause a buffer overflow with my static  
string API:
STR_STATIC(str, 1024);
sstr_append(str, input);

Of course the above example is a simple one, but often when using  
libc string handling functions for building strings the code gets  
complex and there are all kinds of "is the buffer full already? what  
about now? and now? and now?" and with all of those checks it's easy  
to make mistakes.

The point is that if the APIs are (nearly) impossible to use  
insecurely, it's very easy to verify that the code is safe. The code  
doesn't get safe by lots of checks everywhere, it gets safe by  
placing a minimal amount of checks to small area of the code. The  
correctness of a few checks is a lot easier to verify.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 186 bytes --]

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Buffer overflows
  2007-08-30 21:35     ` Reece Dunn
  2007-08-30 21:51       ` Timo Sirainen
@ 2007-08-30 22:14       ` Junio C Hamano
  2007-08-30 22:36         ` Pierre Habouzit
                           ` (2 more replies)
  1 sibling, 3 replies; 26+ messages in thread
From: Junio C Hamano @ 2007-08-30 22:14 UTC (permalink / raw)
  To: Reece Dunn; +Cc: Timo Sirainen, Linus Torvalds, git

"Reece Dunn" <msclrhd@googlemail.com> writes:

> Why is it easier? If you have a fixed-size buffer, why not use
> strncpy, which is what a safe string API is essentially doing anyway?

I would not claim unchecked strcpy is good -- we obviously would
want to fix them.

But at the same time use of strncpy, strlcpy and friends solves
only half of the problem.  Often people say "use strncpy or
strlcpy then you would not overstep the buffer", but that does
not really solve anything, without additional logic to deal with
resulting truncation (barfing with "insanely long string" error
message and dying is the least impact).  Continuing the work on
data that the user did not intend to give you is just as wrong
as using corrupt data that overflowed your static buffer.

Does Timo's nonstandard API solve that issue?  Perhaps it does,
perhaps not.  Does it make easier to maintain our code?  I
highly doubt it in the current shape.

It is well and widely understood idiom to use strlcpy to a
fixed-sized buffer and checking the resulting length to make
sure the result would not have overflowed (and if it would have,
issue an error and die).  I would not have anything against a
set of patches to follow such a pattern.

But a patch to add a non-standard API that nobody else uses,
without any patch to show the changes to a few places that could
use the API to demonstrate that the use of API vastly cleans the
code up and makes it infinitely harder to make mistakes?

The API needs to justify itself to convince the people who needs
to learn and adjust to that the benefit far outweighes deviation
from better known patterns, and I do not see that happening in
Timo's patch.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Buffer overflows
  2007-08-30 21:51       ` Timo Sirainen
@ 2007-08-30 22:34         ` Reece Dunn
  2007-08-31 10:52           ` Wincent Colaiuta
  0 siblings, 1 reply; 26+ messages in thread
From: Reece Dunn @ 2007-08-30 22:34 UTC (permalink / raw)
  To: Timo Sirainen, Linus Torvalds, git

On 30/08/2007, Timo Sirainen <tss@iki.fi> wrote:
> On 31.8.2007, at 0.35, Reece Dunn wrote:
>
> >> The problem is that the git code is full of these random cases. It's
> >> simply a huge job to even try to verify the correctness of it. Even
> >> if someone did that and fixed all the problems, tomorrow there would
> >> be new ones because noone bothers to even try to avoid them. So there
> >> really isn't any point in trying to make git secure until the coding
> >> style changes.
> >
> > You don't want a manual check to do these kinds of checks. Not only is
> > it a huge job, you have the human factor: people make mistakes. This
> > is (in part) what the review process is for, but understanding how to
> > identify code that is safe from buffer overruns, integer overflows and
> > the like is a complex task. Also, it may work on 32-bit which has been
> > verified, but not on 64-bit.
> >
> > It would be far better to specify the rules on how to detect these
> > issues into a static analysis tool and have that do the checking for
> > you. Therefore, it is possible to detect when new problems have been
> > added into the codebase. Does sparse support identifying these issues?
>
> Yes, it is a complex task. But if there did exist such a static
> analyzer tool already, it would probably show that half of the strcpy
> () calls (and others) in git are currently unsafe. Wouldn't help all
> that much I think.

Let's see:

symlinks.c(39): In has_symlink_leading_path, the last_symlink input
argument needs to be large enough to hold MAX_PATH characters, as this
is sizeof(path). This is a potential risk, depending on how this
method is used.

sideband.c(17): buf is large enough to copy the "remote:" string
literal into. Also, the buffer is large enough for the input being
read, as packet_read_line specifies the size of the remaining buffer
with an extra character reserved for the null. Therefore, this usage
is safe.

sha1_file.c(550): open_pack_index does a runtime calculation that
essentially results in strcpy( ".pack", ".idx" ). As these are
literals, this usage is safe, provided that strlen(idx_name) >
strlen(".pack") which should be ensured by the caller. Hence, this
usage is safe.

So far, these are looking safe to me. I'd need to check the remaining
uses to be sure and to find the strcpy usage that you have identified
as an issue.

It would also be worth creating a test case that triggers this bug, as
that can then be used to ensure that it does not reappear once fixed.
That is the best way to deal with these, as not all uses of "unsafe"
API are actually unsafe.

> >> The code should be easy to verify to be secure, and with some kind of
> >> a safe string API it's a lot easier than trying to figure out corner
> >> cases where strcpy() calls break.
> >
> > Why is it easier? If you have a fixed-size buffer, why not use
> > strncpy, which is what a safe string API is essentially doing anyway?
>
> Well, strncpy() is a pretty good example actually. A lot of people
> use it wrong, because they don't realize that it doesn't necessarily
> NUL-terminate the strings. So it's another example of a bad API that
> can be easily used wrong. And besides that, it also fills the rest of
> the buffer with NULs, which is almost always pointless waste of CPU.

Ok, I take your point. However, there is nothing preventing these API
from being used correctly. Granted, this is more work and like the
strncpy example, have subtle behaviour, but that does not make it
impossible.

As an example, do your safe API do null pointer checks. This is
because strcpy, strlen and the like don't, which is one of the reasons
why they are considered unsafe. But then, if you guarantee that you
are not passing a null pointer to one of these API, why take the hit
of the additional checks when you know that these are safe. It is also
easier to debug the problem if it happens at the point of call,
instead of silently working.

One of the problems with a safe string API is that there are different
possible things to do when an overrun is detected. The best thing to
do would be to cap the input to the buffer size and return some error
status. For example, you could have:

    STR_STATIC(str, 1);
    sstr_append(str, "/foo/bar");
    // run "rm -rf ${str}"

which would hose your system. This is worse than a buffer overrun.

> And why is safe string API easier to verify? Here's an example:
>
> // see how easily you can use strncpy() to cause a buffer overflow:
> char buf[1024];
> strncpy(buf, input, 2048);

Yes it can, as above. A subtler case would be when the space for a
null terminator is missed. But then I'd write something like:

    char buf[1024];
    strncpy(buf, input, sizeof(buf));

Here, the compiler does the work for me, so if I change 1024 to 512,
it will still be safe.

> // see how impossible it is to cause a buffer overflow with my static
> string API:
> STR_STATIC(str, 1024);
> sstr_append(str, input);

How is str defined? Looking at the code, I can't _see_ that this is
safe. All I see as inputs are `str` and `input`, so how can I verify
that this is correct without looking at how these are defined.

> Of course the above example is a simple one, but often when using
> libc string handling functions for building strings the code gets
> complex and there are all kinds of "is the buffer full already? what
> about now? and now? and now?" and with all of those checks it's easy
> to make mistakes.

But also, those API are well known, along with their associated
problems. By creating a new API, there are new (unknown) ways to
abuse/misuse it.

> The point is that if the APIs are (nearly) impossible to use
> insecurely, it's very easy to verify that the code is safe. The code
> doesn't get safe by lots of checks everywhere, it gets safe by
> placing a minimal amount of checks to small area of the code. The
> correctness of a few checks is a lot easier to verify.

But what about that nearly impossible case?

Also, how do you use the safe string API with other API that don't
have a "safe" equivalent?

- Reece

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Buffer overflows
  2007-08-30 22:14       ` Junio C Hamano
@ 2007-08-30 22:36         ` Pierre Habouzit
  2007-08-30 22:41         ` Timo Sirainen
  2007-09-02 13:42         ` Johan Herland
  2 siblings, 0 replies; 26+ messages in thread
From: Pierre Habouzit @ 2007-08-30 22:36 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Reece Dunn, Timo Sirainen, Linus Torvalds, git

[-- Attachment #1: Type: text/plain, Size: 981 bytes --]

On Thu, Aug 30, 2007 at 10:14:04PM +0000, Junio C Hamano wrote:
> "Reece Dunn" <msclrhd@googlemail.com> writes:
> 
> > Why is it easier? If you have a fixed-size buffer, why not use
> > strncpy, which is what a safe string API is essentially doing anyway?
> 
> I would not claim unchecked strcpy is good -- we obviously would
> want to fix them.
> 
> But at the same time use of strncpy, strlcpy and friends solves
> only half of the problem.

  Actually, strncpy solves nothing as it's completely broken in so many
ways: it does not necessarily ends the string with a NUL-char, and it
NUL-pads the buffer, making it really slow when you use it top copy 10
chars in a BUFSIZ-big buffer.

  strncpy should never ever be used, few programmers understand it, and
it's very error prone.

-- 
·O·  Pierre Habouzit
··O                                                madcoder@debian.org
OOO                                                http://www.madism.org

[-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Buffer overflows
  2007-08-30 22:14       ` Junio C Hamano
  2007-08-30 22:36         ` Pierre Habouzit
@ 2007-08-30 22:41         ` Timo Sirainen
  2007-09-02 13:42         ` Johan Herland
  2 siblings, 0 replies; 26+ messages in thread
From: Timo Sirainen @ 2007-08-30 22:41 UTC (permalink / raw)
  To: Junio C Hamano; +Cc: Reece Dunn, Linus Torvalds, git

[-- Attachment #1: Type: text/plain, Size: 3356 bytes --]

On 31.8.2007, at 1.14, Junio C Hamano wrote:

> "Reece Dunn" <msclrhd@googlemail.com> writes:
>
>> Why is it easier? If you have a fixed-size buffer, why not use
>> strncpy, which is what a safe string API is essentially doing anyway?
>
> I would not claim unchecked strcpy is good -- we obviously would
> want to fix them.
>
> But at the same time use of strncpy, strlcpy and friends solves
> only half of the problem.  Often people say "use strncpy or
> strlcpy then you would not overstep the buffer", but that does
> not really solve anything, without additional logic to deal with
> resulting truncation (barfing with "insanely long string" error
> message and dying is the least impact).  Continuing the work on
> data that the user did not intend to give you is just as wrong
> as using corrupt data that overflowed your static buffer.

Sure, I agree with that. In my own code I avoid static buffer sizes  
as much as I can. But since git's code is far from even trying to do  
that, I thought it would be easier to start with a simpler plan: Try  
not to have remote code execution holes that can be found with 2  
minutes of looking at the code.

> Does Timo's nonstandard API solve that issue?  Perhaps it does,
> perhaps not.

That would require moving to dynamically growing strings, which in  
turn requires freeing the strings afterwards so it's not such a  
simple job. Simply replacing the current char[] buffers with my  
static_string would be quick and easy and although it wouldn't fix  
all potential problems, it would fix most of the buffer overflows.

> Does it make easier to maintain our code?  I highly doubt it in the  
> current shape.

Depends on what you mean by "maintain". I find the resulting code a  
lot easier to understand, and a lot easier to verify for correctness  
and safety. Here's an example: http://marc.info/? 
l=git&m=117962988914013&w=2

But then again you and Alex didn't seem to think so.

> It is well and widely understood idiom to use strlcpy to a
> fixed-sized buffer and checking the resulting length to make
> sure the result would not have overflowed (and if it would have,
> issue an error and die).  I would not have anything against a
> set of patches to follow such a pattern.

I don't like strlcpy()/strlcat() all that much either, because  
checking the overflow is more difficult than it needs to be. For  
example:

if (strlcpy(dest, src, sizeof(dest)) <= sizeof(dest)) overflow();
// compared to a function that simply returns if it overflowed or not:
if (strocpy(dest, src, sizeof(dest)) < 0) overflow();

Actually I'm not even sure if the above strlcpy() check is right. Is  
it <= or <?

> The API needs to justify itself to convince the people who needs
> to learn and adjust to that the benefit far outweighes deviation
> from better known patterns, and I do not see that happening in
> Timo's patch.

The better known patterns are being used insecurely all the time now,  
so I can't really see how this would be anything worse.

Anyway my point wasn't to get my code into git. I just wanted that  
*something* would be done about this. Currently I just can't see  
myself wanting to use git, because it limits what I can do with it.  
Any kind of automated processing is completely out of the question  
because then attackers could easily take over my machine if they  
wanted to.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 186 bytes --]

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] Temporary fix for stack smashing in mailinfo
  2007-08-30 21:48 ` [PATCH] Temporary fix for stack smashing in mailinfo Alex Riesen
@ 2007-08-30 22:53   ` Junio C Hamano
  0 siblings, 0 replies; 26+ messages in thread
From: Junio C Hamano @ 2007-08-30 22:53 UTC (permalink / raw)
  To: Alex Riesen; +Cc: Timo Sirainen, git

Alex Riesen <raa.lkml@gmail.com> writes:

> Junio, I cannot have time to fix the code nice and proper, but as
> heavy user of git-am just have to have it fixed at least a like this.
> And this is ugly (and definitely incomplete), everyone be warned.
>
> Checked with valgrind, looks good (except for iconv_open reading past
> one of its arguments):

On the top of your patch, I think decode_header_bq() needs to
make sure that a string with more than one pieces, each of which
decodes well within piecebuf, cannot overflow outbuf[] in the
while loop.

> @@ -578,56 +588,56 @@ static int decode_header_bq(char *it)
>  		default:
>  			return rfc2047; /* no munging */
>  		case 'b':
> -			sz = decode_b_segment(cp + 3, piecebuf, ep);
> +			sz = decode_b_segment(cp + 3, piecebuf, sizeof(piecebuf), ep);
>  			break;
>  		case 'q':
> -			sz = decode_q_segment(cp + 3, piecebuf, ep, 1);
> +			sz = decode_q_segment(cp + 3, piecebuf, sizeof(piecebuf), ep, 1);
>  			break;
>  		}
>  		if (sz < 0)
>  			return rfc2047;
>  		if (metainfo_charset)
> -			convert_to_utf8(piecebuf, charset_q);
> +			convert_to_utf8(piecebuf, sizeof(piecebuf), charset_q);
>  		strcpy(out, piecebuf);
>  		out += strlen(out);
>  		in = ep + 2;
>  	}

It might also make sense to redo the lower level decoding
functions using existing strbuf interface to build string
without pre-set bounds.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Buffer overflows
  2007-08-30 21:08   ` Timo Sirainen
  2007-08-30 21:35     ` Reece Dunn
@ 2007-08-31  4:09     ` Linus Torvalds
  2007-08-31  5:00       ` Timo Sirainen
  1 sibling, 1 reply; 26+ messages in thread
From: Linus Torvalds @ 2007-08-31  4:09 UTC (permalink / raw)
  To: Timo Sirainen; +Cc: git

On Fri, 31 Aug 2007, Timo Sirainen wrote:
> > 
> > Perhaps because your patch was using a totally nonstandard and slow
> > interface, and had nasty string declaration issues, as people even pointed
> > out to you.
> 
> Slow?

Having a string library, and then implementing "str_append()" with a 
strlen() sounds pretty disgusting to me. 

Gcc could have optimized the strlen() away for constant string arguments, 
but since you made the thing out-of-line, it can't do that any more.

So yes, I bet there are faster string libraries out there.

> The code should be easy to verify to be secure, and with some kind of a safe
> string API it's a lot easier than trying to figure out corner cases where
> strcpy() calls break.

I actually looked at the patches, and the "stringbuf()" thing was just too 
ugly to live. It was also nonportable, in that you use the reserved 
namespace (which we do extensively in the kernel, but that's an 
"embdedded" application that doesn't use system header files).

So the API was anything but "safe".

I think something like that could work, but it really should be done 
right, or not at all.

		Linus

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Buffer overflows
  2007-08-31  4:09     ` Linus Torvalds
@ 2007-08-31  5:00       ` Timo Sirainen
  2007-08-31  9:53         ` Andreas Ericsson
  2007-08-31 10:06         ` Johannes Schindelin
  0 siblings, 2 replies; 26+ messages in thread
From: Timo Sirainen @ 2007-08-31  5:00 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: git


[-- Attachment #1.1: Type: text/plain, Size: 1889 bytes --]

On Thu, 2007-08-30 at 21:09 -0700, Linus Torvalds wrote:
> 
> On Fri, 31 Aug 2007, Timo Sirainen wrote:
> > > 
> > > Perhaps because your patch was using a totally nonstandard and slow
> > > interface, and had nasty string declaration issues, as people even pointed
> > > out to you.
> > 
> > Slow?
> 
> Having a string library, and then implementing "str_append()" with a 
> strlen() sounds pretty disgusting to me. 
> 
> Gcc could have optimized the strlen() away for constant string arguments, 
> but since you made the thing out-of-line, it can't do that any more.
> 
> So yes, I bet there are faster string libraries out there.

Oh, well that's easy to fix. But I don't think the speed matters much in
string manipulation, it's usually not done in performance critical
paths.

> > The code should be easy to verify to be secure, and with some kind of a safe
> > string API it's a lot easier than trying to figure out corner cases where
> > strcpy() calls break.
> 
> I actually looked at the patches, and the "stringbuf()" thing was just too 
> ugly to live. It was also nonportable, in that you use the reserved 
> namespace (which we do extensively in the kernel, but that's an 
> "embdedded" application that doesn't use system header files).
> 
> So the API was anything but "safe".

Some of those were fixed in the patch I sent at the beginning of this
thread. But since you asked, attached is yet another version of it. As
far as I know it's fully C99 compatible. Would be C89 but there's that
va_copy() call.

I guess it could still be optimized more, but at least strlen() is now
in a macro. :)

Or if you don't like the STR_STATIC() macro, another way would be:

#define STR_STATIC_INIT(buf) { buf, sizeof(buf), 0, }

char static_buf[1024];
struct string str = STR_STATIC_INIT(static_buf);

str_append(&str, "hello world");


[-- Attachment #1.2: git-strings.diff --]
[-- Type: text/x-patch, Size: 3063 bytes --]

--- /dev/null	2007-07-12 03:40:41.165212167 +0300
+++ str.h	2007-08-31 07:35:04.000000000 +0300
@@ -0,0 +1,33 @@
+#ifndef STR_H
+#define STR_H
+
+struct string {
+	char *buf;
+
+	unsigned int alloc_size;
+	unsigned int len;
+
+	unsigned int malloced:1;
+	unsigned int overflowed:1;
+};
+
+#define STR_STATIC(name, size) \
+	struct { \
+	  struct string str; \
+	  char string_buf[(size) + 1]; \
+	} name = { { (name).string_buf, sizeof((name).string_buf), 0, } }
+
+extern struct string *str_alloc(unsigned int initial_size);
+extern void str_free(struct string **str);
+
+extern void str_append_n(struct string *str, const char *cstr,
+			 unsigned int len);
+extern void str_printfa(struct string *str, const char *fmt, ...)
+	__attribute__((format (printf, 2, 3)));
+extern void str_truncate(struct string *str, unsigned int len);
+
+#define str_append(str, cstr) str_append_n(str, cstr, strlen(cstr))
+#define str_c(str) ((str)->buf)
+#define str_len(str) ((str)->len)
+
+#endif
--- /dev/null	2007-07-12 03:40:41.165212167 +0300
+++ str.c	2007-08-31 07:36:56.000000000 +0300
@@ -0,0 +1,85 @@
+#include "str.h"
+#include "git-compat-util.h"
+
+struct string *str_alloc(unsigned int initial_size)
+{
+	struct string *str;
+
+	str = xcalloc(sizeof(*str), 1);
+	str->alloc_size = initial_size + 1;
+	str->buf = xmalloc(str->alloc_size);
+	str->buf[0] = '\0';
+	return str;
+}
+
+void str_free(struct string **_str)
+{
+	struct string *str = *_str;
+
+	if (str->malloced)
+		free(str->buf);
+	str->buf = NULL;
+	free(str);
+
+	*_str = NULL;
+}
+
+static void str_grow_if_needed(struct string *str, unsigned int *len)
+{
+	unsigned int avail = str->alloc_size - str->len;
+
+	if (*len >= avail) {
+		if (!str->malloced) {
+			/* static buffer size */
+			*len = avail;
+			str->overflowed = 1;
+		} else {
+			str->alloc_size = (str->len + *len) * 2;
+			str->buf = xrealloc(str->buf, str->alloc_size);
+		}
+	}
+}
+
+void str_append_n(struct string *str, const char *cstr, unsigned int len)
+{
+	str_grow_if_needed(str, &len);
+	memcpy(str->buf + str->len, cstr, len);
+	str->len += len;
+	str->buf[str->len] = '\0';
+}
+
+void str_printfa(struct string *str, const char *fmt, ...)
+{
+	unsigned int len, avail = str->alloc_size - str->len;
+	va_list va, va2;
+	int ret;
+
+	va_start(va, fmt);
+	va_copy(va2, va);
+	ret = vsnprintf(str->buf + str->len, avail, fmt, va);
+	assert(ret >= 0);
+
+	if ((unsigned int)ret >= avail) {
+		if (!str->malloced) {
+			str->overflowed = 1;
+			ret = avail == 0 ? 0 : avail-1;
+		} else {
+			len = ret;
+			str_grow_if_needed(str, &len);
+			avail = str->alloc_size - str->len;
+
+			ret = vsnprintf(str->buf + str->len, avail, fmt, va2);
+			assert(ret >= 0 && (unsigned int)ret < avail);
+		}
+	}
+	str->len += ret;
+	va_end(va);
+}
+
+void str_truncate(struct string *str, unsigned int len)
+{
+	if (len >= str->alloc_size)
+		len = str->alloc_size - 1;
+	str->len = len;
+	str->buf[len] = '\0';
+}

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Buffer overflows
  2007-08-31  5:00       ` Timo Sirainen
@ 2007-08-31  9:53         ` Andreas Ericsson
  2007-08-31 10:06         ` Johannes Schindelin
  1 sibling, 0 replies; 26+ messages in thread
From: Andreas Ericsson @ 2007-08-31  9:53 UTC (permalink / raw)
  To: Timo Sirainen; +Cc: Linus Torvalds, git

Timo Sirainen wrote:
> On Thu, 2007-08-30 at 21:09 -0700, Linus Torvalds wrote:
>> On Fri, 31 Aug 2007, Timo Sirainen wrote:
>>>> Perhaps because your patch was using a totally nonstandard and slow
>>>> interface, and had nasty string declaration issues, as people even pointed
>>>> out to you.
>>> Slow?
>> Having a string library, and then implementing "str_append()" with a 
>> strlen() sounds pretty disgusting to me. 
>>
>> Gcc could have optimized the strlen() away for constant string arguments, 
>> but since you made the thing out-of-line, it can't do that any more.
>>
>> So yes, I bet there are faster string libraries out there.
> 
> Oh, well that's easy to fix. But I don't think the speed matters much in
> string manipulation, it's usually not done in performance critical
> paths.
> 

In git it is, as strings are manipulated en masse in order to traverse
history, look up paths from index/filesystem etc, etc.

-- 
Andreas Ericsson                   andreas.ericsson@op5.se
OP5 AB                             www.op5.se
Tel: +46 8-230225                  Fax: +46 8-230231

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Buffer overflows
  2007-08-31  5:00       ` Timo Sirainen
  2007-08-31  9:53         ` Andreas Ericsson
@ 2007-08-31 10:06         ` Johannes Schindelin
  1 sibling, 0 replies; 26+ messages in thread
From: Johannes Schindelin @ 2007-08-31 10:06 UTC (permalink / raw)
  To: Timo Sirainen; +Cc: Linus Torvalds, git

Hi,

On Fri, 31 Aug 2007, Timo Sirainen wrote:

> On Thu, 2007-08-30 at 21:09 -0700, Linus Torvalds wrote:
> 
> > So yes, I bet there are faster string libraries out there.
> 
> Oh, well that's easy to fix. But I don't think the speed matters much in 
> string manipulation, it's usually not done in performance critical 
> paths.

AFAIR one of the recent performance studies showed strlen() as one of the 
_biggest_ offenders.  So no, your point has been disproven _already_.

I have to wonder, though, why you do not just go and enhance strbuf.[ch], 
which is nice and easy, and not the least unelegant.

Ciao,
Dscho

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Buffer overflows
  2007-08-30 22:34         ` Reece Dunn
@ 2007-08-31 10:52           ` Wincent Colaiuta
  2007-08-31 12:48             ` Simon 'corecode' Schubert
  0 siblings, 1 reply; 26+ messages in thread
From: Wincent Colaiuta @ 2007-08-31 10:52 UTC (permalink / raw)
  To: Reece Dunn; +Cc: Timo Sirainen, Linus Torvalds, git

El 31/8/2007, a las 0:34, Reece Dunn escribió:

> As an example, do your safe API do null pointer checks. This is
> because strcpy, strlen and the like don't, which is one of the reasons
> why they are considered unsafe. But then, if you guarantee that you
> are not passing a null pointer to one of these API, why take the hit
> of the additional checks when you know that these are safe.

Do you really think that comparing a pointer to NULL is going to be a  
speed hit? I would imagine that on most architectures it boils down  
to one or two machine code instructions.

Cheers,
Wincent

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Buffer overflows
  2007-08-31 10:52           ` Wincent Colaiuta
@ 2007-08-31 12:48             ` Simon 'corecode' Schubert
  0 siblings, 0 replies; 26+ messages in thread
From: Simon 'corecode' Schubert @ 2007-08-31 12:48 UTC (permalink / raw)
  To: Wincent Colaiuta; +Cc: Reece Dunn, Timo Sirainen, Linus Torvalds, git

Wincent Colaiuta wrote:
>> As an example, do your safe API do null pointer checks. This is
>> because strcpy, strlen and the like don't, which is one of the reasons
>> why they are considered unsafe. But then, if you guarantee that you
>> are not passing a null pointer to one of these API, why take the hit
>> of the additional checks when you know that these are safe.
> Do you really think that comparing a pointer to NULL is going to be a 
> speed hit? I would imagine that on most architectures it boils down to 
> one or two machine code instructions.

The question rather is, why should you bother comparing to a NULL pointer?  To return an error (EINVAL?)?  I'd rather have either a) the caller check or b) the process segfault.  A segfault gives me a nice core file which I can use to hunt the bug.

I also don't see why not checking for NULL pointers is unsafe.  Okay, maybe there are platforms out there which do not crash on a NULL pointer derefence, but I doubt these are consumers of git.  All other platforms are safe by the implicit check of the MMU.

The worst thing is something like

if (ptr == NULL)
	abort();

which only adds code (and thus needs maintenance), but no value whatsoever.  Either the following code tolerates NULL pointers or it will crash and segfault, so why bother panicing before.

Of course I might be totally of track...

cheers
  simon

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Buffer overflows
  2007-08-30 22:14       ` Junio C Hamano
  2007-08-30 22:36         ` Pierre Habouzit
  2007-08-30 22:41         ` Timo Sirainen
@ 2007-09-02 13:42         ` Johan Herland
  2007-09-02 15:11           ` Reece Dunn
  2007-09-02 17:17           ` René Scharfe
  2 siblings, 2 replies; 26+ messages in thread
From: Johan Herland @ 2007-09-02 13:42 UTC (permalink / raw)
  To: git; +Cc: Junio C Hamano, Reece Dunn, Timo Sirainen, Linus Torvalds

On Friday 31 August 2007, Junio C Hamano wrote:
> It is well and widely understood idiom to use strlcpy to a
> fixed-sized buffer and checking the resulting length to make
> sure the result would not have overflowed (and if it would have,
> issue an error and die).  I would not have anything against a
> set of patches to follow such a pattern.
> 
> But a patch to add a non-standard API that nobody else uses,
> without any patch to show the changes to a few places that could
> use the API to demonstrate that the use of API vastly cleans the
> code up and makes it infinitely harder to make mistakes?
> 
> The API needs to justify itself to convince the people who needs
> to learn and adjust to that the benefit far outweighes deviation
> from better known patterns, and I do not see that happening in
> Timo's patch.

So in general, git people seem to be saying that:

1. Yes, we agree that the C string library suX0rs badly.

2. There are more than 0 string manipulation bugs (e.g. buffer overflows) in 
git. The number may be small or large, but I have yet to see anyone claim 
it's _zero_.

3. Timo's patches (in their current form) are not the way to go, because of 
non-standard API, implementation problems, whatever...

So why does the discussion end there? Lukas proposed an interesting 
alternative in "The Better String Library" ( 
http://bstring.sourceforge.net/ ). Why has there been lots of bashing on 
Timo's efforts, but no critique of bstring? I'd be very keen to know what 
the git developers think of it. AFAICS, it seems to fulfill at least _some_ 
of the problems people find in Timo's patches. Specifically, it claims:

- High performance (better than the C string library)
- Simple usage

I'd also say it's probably more widely used than Timo's patches.

If the only response to Timo's highlighting of string manipulation problems 
in git, is for us to flame his patches and leave it at that, then I have no 
choice but to agree with him in that security does not seem to matter to 
us.

...Johan

-- 
Johan Herland, <johan@herland.net>
www.herland.net

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Buffer overflows
  2007-09-02 13:42         ` Johan Herland
@ 2007-09-02 15:11           ` Reece Dunn
  2007-09-02 15:19             ` David Kastrup
  2007-09-02 17:17           ` René Scharfe
  1 sibling, 1 reply; 26+ messages in thread
From: Reece Dunn @ 2007-09-02 15:11 UTC (permalink / raw)
  To: Johan Herland, git, Junio C Hamano, Timo Sirainen, Linus Torvalds

On 02/09/07, Johan Herland <johan@herland.net> wrote:
> On Friday 31 August 2007, Junio C Hamano wrote:
> > The API needs to justify itself to convince the people who needs
> > to learn and adjust to that the benefit far outweighes deviation
> > from better known patterns, and I do not see that happening in
> > Timo's patch.
>
> So in general, git people seem to be saying that:
>
> 1. Yes, we agree that the C string library suX0rs badly.
>
> 2. There are more than 0 string manipulation bugs (e.g. buffer overflows) in
> git. The number may be small or large, but I have yet to see anyone claim
> it's _zero_.
>
> 3. Timo's patches (in their current form) are not the way to go, because of
> non-standard API, implementation problems, whatever...
>
> So why does the discussion end there? Lukas proposed an interesting
> alternative in "The Better String Library" (
> http://bstring.sourceforge.net/ ). Why has there been lots of bashing on
> Timo's efforts, but no critique of bstring? I'd be very keen to know what
> the git developers think of it. AFAICS, it seems to fulfill at least _some_
> of the problems people find in Timo's patches. Specifically, it claims:
>
> - High performance (better than the C string library)
> - Simple usage

Performing a brief look at the documentation, the bstring library
looks promising.

It looks like it has an allocate and grow internal buffers on demand
policy. This is similar to what the C++ std::basic_string does, as
well as the string helpers in the Boost version of Jam (written in C).
This hides the buffer management from the user of the library, rather
than obfuscating it like in Timo's patch.

The API defined in the documentation is well thought out and
extensive, moreso than in the efforts by Timo and others. It has the
traditional C API, along with other API found in other string
libraries (such as split and join). I am not sure how much of git
could make use of these, but they have the pontential to simplify some
areas of the codebase.

Looking at the documentation, it is clear that this is a well thought
out library, both from the problems/security issues of the C library
and to how it compares with other string libraries. As well as
covering buffer overflow, it also deals with things like integer
overflow.

They have also done performance tests comparing the bstring library to
the C API and C++ std::string. With the C API comparison, the library
performs about 10% slower for string assignment, but other areas don't
have a slowdown. In fact, string concatenation is _considerably_
improved, something that will help git performance. I suspect (but
have not verified) that the slowdown on assignment is due to buffer
allocation.

> I'd also say it's probably more widely used than Timo's patches.

Which is good, as this means that along with the tests in the library,
it will be more stable and less likely to be buggy than something that
is written from scratch.

> If the only response to Timo's highlighting of string manipulation problems
> in git, is for us to flame his patches and leave it at that, then I have no
> choice but to agree with him in that security does not seem to matter to
> us.

I would not like to see that happen. It seems that the bstring library
will help git in more ways than security, by improving string
concatenation performance and giving a richer string API without
sacrificing performance (except where noted) and code clarity.

It would be interesting to see how the 10% performance drop on string
assignment impacts git performance, when balanced with the drastic
(92x in the performance table) increase on string concatenation.

The only major issue that I can see with bstring is that it does not
have a wchar_t version, but git is using chars internally, so this is
not a problem for git.

- Reece

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Buffer overflows
  2007-09-02 15:11           ` Reece Dunn
@ 2007-09-02 15:19             ` David Kastrup
  2007-09-02 15:35               ` Reece Dunn
  2007-09-03  0:19               ` Jakub Narebski
  0 siblings, 2 replies; 26+ messages in thread
From: David Kastrup @ 2007-09-02 15:19 UTC (permalink / raw)
  To: Reece Dunn
  Cc: Johan Herland, git, Junio C Hamano, Timo Sirainen, Linus Torvalds

"Reece Dunn" <msclrhd@googlemail.com> writes:

> Which is good, as this means that along with the tests in the
> library, it will be more stable and less likely to be buggy than
> something that is written from scratch.

Remember git's history.

-- 
David Kastrup, Kriemhildstr. 15, 44793 Bochum

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Buffer overflows
  2007-09-02 15:19             ` David Kastrup
@ 2007-09-02 15:35               ` Reece Dunn
  2007-09-03  0:19               ` Jakub Narebski
  1 sibling, 0 replies; 26+ messages in thread
From: Reece Dunn @ 2007-09-02 15:35 UTC (permalink / raw)
  To: David Kastrup, Johan Herland, git, Junio C Hamano, Timo Sirainen,
	"Linus Torvalds" <torvalds

On 02/09/07, David Kastrup <dak@gnu.org> wrote:
> "Reece Dunn" <msclrhd@googlemail.com> writes:
>
> > Which is good, as this means that along with the tests in the
> > library, it will be more stable and less likely to be buggy than
> > something that is written from scratch.
>
> Remember git's history.

True!

- Reece

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Buffer overflows
  2007-09-02 13:42         ` Johan Herland
  2007-09-02 15:11           ` Reece Dunn
@ 2007-09-02 17:17           ` René Scharfe
  2007-09-02 17:39             ` Lukas Sandström
  1 sibling, 1 reply; 26+ messages in thread
From: René Scharfe @ 2007-09-02 17:17 UTC (permalink / raw)
  To: Johan Herland
  Cc: git, Junio C Hamano, Reece Dunn, Timo Sirainen, Linus Torvalds

Johan Herland schrieb:
> So why does the discussion end there? Lukas proposed an interesting 
> alternative in "The Better String Library" ( 
> http://bstring.sourceforge.net/ ). Why has there been lots of bashing on 
> Timo's efforts, but no critique of bstring? I'd be very keen to know what 
> the git developers think of it. AFAICS, it seems to fulfill at least _some_ 
> of the problems people find in Timo's patches. Specifically, it claims:
> 
> - High performance (better than the C string library)
> - Simple usage
> 
> I'd also say it's probably more widely used than Timo's patches.
> 
> 
> If the only response to Timo's highlighting of string manipulation problems 
> in git, is for us to flame his patches and leave it at that, then I have no 
> choice but to agree with him in that security does not seem to matter to 
> us.

Well, a patch (8dabdfcc) from Alex Riesen has made it into 1.5.3 which
fixes some of the problems.  That's a start.

And don't forget that we have our very own string library, viz.
strbuf.c, which could see more use.

That said, I agree that bstring looks well thought out.  It's also quite
large (lots of functions, lots of code where a bug might lurk).  Hmm.

Now if only someone could demonstrate the advantages of using bstring in
git by posting a nice patch.. :-P

René

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Buffer overflows
  2007-09-02 17:17           ` René Scharfe
@ 2007-09-02 17:39             ` Lukas Sandström
  0 siblings, 0 replies; 26+ messages in thread
From: Lukas Sandström @ 2007-09-02 17:39 UTC (permalink / raw)
  To: René Scharfe
  Cc: Johan Herland, git, Junio C Hamano, Reece Dunn, Timo Sirainen,
	Linus Torvalds

René Scharfe wrote:
> And don't forget that we have our very own string library, viz.
> strbuf.c, which could see more use.
> 
> That said, I agree that bstring looks well thought out.  It's also quite
> large (lots of functions, lots of code where a bug might lurk).  Hmm.
> 
> Now if only someone could demonstrate the advantages of using bstring in
> git by posting a nice patch.. :-P
> 
> René

I'm currently working on rewriting builtin-mailinfo.c to use bstring. 
I'll hopefully have a proof-of-concept ready today or tomorrow.

/Lukas

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Buffer overflows
  2007-09-02 15:19             ` David Kastrup
  2007-09-02 15:35               ` Reece Dunn
@ 2007-09-03  0:19               ` Jakub Narebski
  2007-09-03  0:31                 ` Junio C Hamano
  1 sibling, 1 reply; 26+ messages in thread
From: Jakub Narebski @ 2007-09-03  0:19 UTC (permalink / raw)
  To: git

David Kastrup wrote:

> "Reece Dunn" <msclrhd@googlemail.com> writes:
> 
>> Which is good, as this means that along with the tests in the
>> library, it will be more stable and less likely to be buggy than
>> something that is written from scratch.
> 
> Remember git's history.

On the other hand instead of doing binary diffs from a scratch, git uses
[customized] LibXDiff library. The same might be done with bstring library,
I think.

-- 
Jakub Narebski
Warsaw, Poland
ShadeHawk on #git

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: Buffer overflows
  2007-09-03  0:19               ` Jakub Narebski
@ 2007-09-03  0:31                 ` Junio C Hamano
  0 siblings, 0 replies; 26+ messages in thread
From: Junio C Hamano @ 2007-09-03  0:31 UTC (permalink / raw)
  To: Jakub Narebski; +Cc: git

Jakub Narebski <jnareb@gmail.com> writes:

> David Kastrup wrote:
> ...
>> Remember git's history.
>
> On the other hand instead of doing binary diffs from a scratch, git uses
> [customized] LibXDiff library. The same might be done with bstring library,
> I think.

I am very much in favor of reusing code that aleady proved
itself in the field outside git's context.

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2007-09-03  0:31 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-08-30 19:26 Buffer overflows Timo Sirainen
2007-08-30 20:26 ` Lukas Sandström
2007-08-30 20:46 ` Linus Torvalds
2007-08-30 21:08   ` Timo Sirainen
2007-08-30 21:35     ` Reece Dunn
2007-08-30 21:51       ` Timo Sirainen
2007-08-30 22:34         ` Reece Dunn
2007-08-31 10:52           ` Wincent Colaiuta
2007-08-31 12:48             ` Simon 'corecode' Schubert
2007-08-30 22:14       ` Junio C Hamano
2007-08-30 22:36         ` Pierre Habouzit
2007-08-30 22:41         ` Timo Sirainen
2007-09-02 13:42         ` Johan Herland
2007-09-02 15:11           ` Reece Dunn
2007-09-02 15:19             ` David Kastrup
2007-09-02 15:35               ` Reece Dunn
2007-09-03  0:19               ` Jakub Narebski
2007-09-03  0:31                 ` Junio C Hamano
2007-09-02 17:17           ` René Scharfe
2007-09-02 17:39             ` Lukas Sandström
2007-08-31  4:09     ` Linus Torvalds
2007-08-31  5:00       ` Timo Sirainen
2007-08-31  9:53         ` Andreas Ericsson
2007-08-31 10:06         ` Johannes Schindelin
2007-08-30 21:48 ` [PATCH] Temporary fix for stack smashing in mailinfo Alex Riesen
2007-08-30 22:53   ` Junio C Hamano

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).