git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Junio C Hamano <gitster@pobox.com>
To: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Miles Bader <miles@gnu.org>, Jeff King <peff@peff.net>,
	Nguyen Thai Ngoc Duy <pclouds@gmail.com>,
	git@vger.kernel.org
Subject: Re: [PATCH] grep: do not do external grep on skip-worktree entries
Date: Sun, 10 Jan 2010 22:39:36 -0800	[thread overview]
Message-ID: <7vvdf9402f.fsf@alter.siamese.dyndns.org> (raw)
In-Reply-To: <alpine.LFD.2.00.1001040801290.3630@localhost.localdomain> (Linus Torvalds's message of "Mon\, 4 Jan 2010 08\:03\:04 -0800 \(PST\)")

Linus Torvalds <torvalds@linux-foundation.org> writes:

> It doesn't matter. Since we do the line-by-line thing, the input is always 
> so short that DFA vs NFA vs BM vs other-clever-search doesn't matter. 
> There is no scaling - the grep buffer tends to be too small for the 
> algorithm to matter.
>
> And the reason we do things line-by-line is that we need to then output 
> things line-per-line.

Here is an experimental patch; first, some numbers (hot cache best of 5 runs).

(baseline -- builtin grep)

    $ time git grep --no-ext-grep qwerty
    drivers/char/keyboard.c:        "qwertyuiop[]\r\000as"...

    real    0m1.521s
    user    0m1.260s
    sys     0m0.256s

    (baseline -- external grep)

    $ time git grep --ext-grep qwerty
    drivers/char/keyboard.c:        "qwertyuiop[]\r\000as"...

    real    0m0.773s
    user    0m0.416s
    sys     0m0.376s

    (with experimental patch -- builtin grep)

    $ time ../git.git/git grep --no-ext-grep qwerty
    drivers/char/keyboard.c:        "qwertyuiop[]\r\000as"...

    real    0m0.692s
    user    0m0.440s
    sys     0m0.252s

The idea is to see the earliest location in the entire remainder of the
buffer the pattern(s) we are looking for first appears, and skip all the
lines before that point.  For this optimization to be valid, we should not
be looking for anything complex (e.g. "find lines that have both X and Y
but not Z" is out).  We cannot use the optimization when "-v" is given,
either, because then we need to find the first line that does _not_ match
given set of patterns.

---

 grep.c |   64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 64 insertions(+), 0 deletions(-)

diff --git a/grep.c b/grep.c
index bdadf2c..940e200 100644
--- a/grep.c
+++ b/grep.c
@@ -615,6 +615,65 @@ static void show_pre_context(struct grep_opt *opt, const char *name, char *buf,
 	}
 }
 
+static int should_lookahead(struct grep_opt *opt)
+{
+	struct grep_pat *p;
+
+	if (opt->extended)
+		return 0; /* punt for too complex stuff */
+	if (opt->invert || opt->unmatch_name_only)
+		return 0;
+	for (p = opt->pattern_list; p; p = p->next) {
+		if (p->token != GREP_PATTERN)
+			return 0; /* punt for "header only" and stuff */
+	}
+	return 1;
+}
+
+static int look_ahead(struct grep_opt *opt,
+		      unsigned long *left_p,
+		      unsigned *lno_p,
+		      char **bol_p)
+{
+	unsigned lno = *lno_p;
+	char *bol = *bol_p;
+	struct grep_pat *p;
+	char *sp, *last_bol;
+	regoff_t earliest = -1;
+
+	for (p = opt->pattern_list; p; p = p->next) {
+		int hit;
+		regmatch_t m;
+
+		if (p->fixed)
+			hit = !fixmatch(p->pattern, bol, p->ignore_case, &m);
+		else
+			hit = !regexec(&p->regexp, bol, 1, &m, 0);
+		if (!hit || m.rm_so < 0 || m.rm_eo < 0)
+			continue;
+		if (earliest < 0 || m.rm_so < earliest)
+			earliest = m.rm_so;
+	}
+
+	if (earliest < 0) {
+		*bol_p = bol + *left_p;
+		*left_p = 0;
+		return 1;
+	}
+	for (sp = bol + earliest; bol < sp && sp[-1] != '\n'; sp--)
+		; /* find the beginning of the line */
+	last_bol = sp;
+
+	for (sp = bol; sp < last_bol; sp++) {
+		if (*sp == '\n')
+			lno++;
+	}
+	*left_p -= last_bol - bol;
+	*bol_p = last_bol;
+	*lno_p = lno;
+	return 0;
+}
+
 static int grep_buffer_1(struct grep_opt *opt, const char *name,
 			 char *buf, unsigned long size, int collect_hits)
 {
@@ -624,6 +683,7 @@ static int grep_buffer_1(struct grep_opt *opt, const char *name,
 	unsigned last_hit = 0;
 	int binary_match_only = 0;
 	unsigned count = 0;
+	int try_lookahead = 0;
 	enum grep_context ctx = GREP_CONTEXT_HEAD;
 	xdemitconf_t xecfg;
 
@@ -652,11 +712,15 @@ static int grep_buffer_1(struct grep_opt *opt, const char *name,
 			opt->priv = &xecfg;
 		}
 	}
+	try_lookahead = should_lookahead(opt);
 
 	while (left) {
 		char *eol, ch;
 		int hit;
 
+		if (try_lookahead
+		    && look_ahead(opt, &left, &lno, &bol))
+			break;
 		eol = end_of_line(bol, &left);
 		ch = *eol;
 		*eol = 0;

  reply	other threads:[~2010-01-11  6:39 UTC|newest]

Thread overview: 60+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-12-30 14:11 [PATCH] grep: do not do external grep on skip-worktree entries Nguyễn Thái Ngọc Duy
2009-12-31  7:01 ` Junio C Hamano
2009-12-31  7:09   ` Junio C Hamano
2010-01-02 11:50     ` Nguyen Thai Ngoc Duy
2010-01-02 18:44       ` Junio C Hamano
2010-01-02 19:15         ` Nguyen Thai Ngoc Duy
2010-01-02 19:45           ` Junio C Hamano
2010-01-03  2:35             ` Miles Bader
2010-01-03  2:47               ` Miles Bader
2010-01-03  3:08                 ` Miles Bader
2010-01-03 19:32                   ` Linus Torvalds
2010-01-03 20:49                     ` Junio C Hamano
2010-01-04  5:31                       ` Jeff King
2010-01-04  5:52                         ` Junio C Hamano
2010-01-04  6:44                           ` Jeff King
2010-01-04  7:08                             ` Junio C Hamano
2010-01-04  7:14                               ` Junio C Hamano
2010-01-04  7:29                                 ` Jeff King
2010-01-04  7:26                               ` Jeff King
2010-01-04  8:09                                 ` Jeff King
2010-01-04 16:01                                   ` Linus Torvalds
2010-01-04 15:54                             ` Linus Torvalds
2010-01-04 15:57                               ` Miles Bader
2010-01-04 16:03                                 ` Linus Torvalds
2010-01-11  6:39                                   ` Junio C Hamano [this message]
2010-01-11 15:43                                     ` Linus Torvalds
2010-01-11 15:59                                       ` Linus Torvalds
2010-01-11 16:22                                         ` Junio C Hamano
2010-01-11 16:24                                           ` Junio C Hamano
2010-01-11 16:33                                           ` Linus Torvalds
2010-01-12  8:29                                             ` Junio C Hamano
2010-01-12  8:31                                               ` [PATCH] grep: lookahead optimization can be used with -L option Junio C Hamano
2010-01-12  8:32                                               ` [PATCH] grep: -L should show empty files Junio C Hamano
2010-01-12 21:27                                                 ` Sverre Rabbelier
2010-01-13  6:56                                                   ` Junio C Hamano
2010-01-13 16:04                                                     ` Sverre Rabbelier
2010-01-13 19:48                                                       ` Junio C Hamano
2010-01-13  6:48                                               ` [PATCH 1/2] grep: rip out support for external grep Junio C Hamano
2010-01-13  8:29                                                 ` Jay Soffian
2010-01-13  8:59                                                   ` Junio C Hamano
2010-01-13 15:20                                                 ` Linus Torvalds
2010-01-13  6:51                                               ` [PATCH 2/2] grep: rip out pessimization to use fixmatch() Junio C Hamano
2010-01-12 16:21                                         ` [PATCH] grep: do not do external grep on skip-worktree entries Jeff King
2010-01-11 19:26                                     ` Fredrik Kuivinen
     [not found]                                     ` <4c8ef71001111119p253170f8q37bcd3708d894a62@mail.gmail.com>
2010-01-11 19:29                                       ` Linus Torvalds
2010-01-11 19:40                                         ` Fredrik Kuivinen
2010-01-11 20:07                                           ` Linus Torvalds
2010-01-11 21:07                                             ` Fredrik Kuivinen
2010-01-11 21:24                                               ` Linus Torvalds
2010-01-04 16:24                               ` Linus Torvalds
2010-01-04 10:14                           ` Nguyen Thai Ngoc Duy
2010-01-04  6:06                     ` Mike Hommey
2010-01-04  7:04                       ` Jeff King
2010-01-04 12:34             ` [PATCH 1/2] t7002: set test prerequisite "external-grep" if supported Nguyễn Thái Ngọc Duy
2010-01-07  2:37               ` Junio C Hamano
2010-01-07  4:29                 ` Junio C Hamano
2010-01-07 13:27                   ` Nguyen Thai Ngoc Duy
2010-01-07 14:04                     ` Johannes Sixt
2010-01-07 14:26                       ` Nguyen Thai Ngoc Duy
2010-01-04 12:34             ` [PATCH 2/2] t7002: add tests for skip-worktree fixes in commit a67e281 Nguyễn Thái Ngọc Duy

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=7vvdf9402f.fsf@alter.siamese.dyndns.org \
    --to=gitster@pobox.com \
    --cc=git@vger.kernel.org \
    --cc=miles@gnu.org \
    --cc=pclouds@gmail.com \
    --cc=peff@peff.net \
    --cc=torvalds@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).