* [PATCH 1/4] pathspec: save the non-wildcard length part
2012-11-18 9:13 [PATCH 0/4] Some pathspec wildcard optimization Nguyễn Thái Ngọc Duy
@ 2012-11-18 9:13 ` Nguyễn Thái Ngọc Duy
2012-11-19 20:48 ` Junio C Hamano
2012-11-18 9:13 ` [PATCH 2/4] pathspec: do exact comparison on the leading non-wildcard part Nguyễn Thái Ngọc Duy
` (2 subsequent siblings)
3 siblings, 1 reply; 11+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-11-18 9:13 UTC (permalink / raw)
To: git; +Cc: Nguyễn Thái Ngọc Duy
We marks pathspec with wildcards with the field use_wildcard. We could
do better by saving the length of the non-wildcard part, which can be
for optimizations such as f9f6e2c (exclude: do strcmp as much as
possible before fnmatch - 2012-06-07)
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
builtin/ls-files.c | 2 +-
builtin/ls-tree.c | 2 +-
cache.h | 2 +-
dir.c | 6 +++---
tree-walk.c | 4 ++--
5 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/builtin/ls-files.c b/builtin/ls-files.c
index b5434af..4a9ee69 100644
--- a/builtin/ls-files.c
+++ b/builtin/ls-files.c
@@ -337,7 +337,7 @@ void overlay_tree_on_cache(const char *tree_name, const char *prefix)
matchbuf[0] = prefix;
matchbuf[1] = NULL;
init_pathspec(&pathspec, matchbuf);
- pathspec.items[0].use_wildcard = 0;
+ pathspec.items[0].nowildcard_len = pathspec.items[0].len;
} else
init_pathspec(&pathspec, NULL);
if (read_tree(tree, 1, &pathspec))
diff --git a/builtin/ls-tree.c b/builtin/ls-tree.c
index 235c17c..fb76e38 100644
--- a/builtin/ls-tree.c
+++ b/builtin/ls-tree.c
@@ -168,7 +168,7 @@ int cmd_ls_tree(int argc, const char **argv, const char *prefix)
init_pathspec(&pathspec, get_pathspec(prefix, argv + 1));
for (i = 0; i < pathspec.nr; i++)
- pathspec.items[i].use_wildcard = 0;
+ pathspec.items[i].nowildcard_len = pathspec.items[i].len;
pathspec.has_wildcard = 0;
tree = parse_tree_indirect(sha1);
if (!tree)
diff --git a/cache.h b/cache.h
index dbd8018..bf031f1 100644
--- a/cache.h
+++ b/cache.h
@@ -482,7 +482,7 @@ struct pathspec {
struct pathspec_item {
const char *match;
int len;
- unsigned int use_wildcard:1;
+ int nowildcard_len;
} *items;
};
diff --git a/dir.c b/dir.c
index 5a83aa7..c391d46 100644
--- a/dir.c
+++ b/dir.c
@@ -230,7 +230,7 @@ static int match_pathspec_item(const struct pathspec_item *item, int prefix,
return MATCHED_RECURSIVELY;
}
- if (item->use_wildcard && !fnmatch(match, name, 0))
+ if (item->nowildcard_len < item->len && !fnmatch(match, name, 0))
return MATCHED_FNMATCH;
return 0;
@@ -1429,8 +1429,8 @@ int init_pathspec(struct pathspec *pathspec, const char **paths)
item->match = path;
item->len = strlen(path);
- item->use_wildcard = !no_wildcard(path);
- if (item->use_wildcard)
+ item->nowildcard_len = simple_length(path);
+ if (item->nowildcard_len < item->len)
pathspec->has_wildcard = 1;
}
diff --git a/tree-walk.c b/tree-walk.c
index 3f54c02..af871c5 100644
--- a/tree-walk.c
+++ b/tree-walk.c
@@ -626,7 +626,7 @@ enum interesting tree_entry_interesting(const struct name_entry *entry,
&never_interesting))
return entry_interesting;
- if (item->use_wildcard) {
+ if (item->nowildcard_len < item->len) {
if (!fnmatch(match + baselen, entry->path, 0))
return entry_interesting;
@@ -642,7 +642,7 @@ enum interesting tree_entry_interesting(const struct name_entry *entry,
}
match_wildcards:
- if (!item->use_wildcard)
+ if (item->nowildcard_len == item->len)
continue;
/*
--
1.8.0.rc2.23.g1fb49df
^ permalink raw reply related [flat|nested] 11+ messages in thread
* Re: [PATCH 1/4] pathspec: save the non-wildcard length part
2012-11-18 9:13 ` [PATCH 1/4] pathspec: save the non-wildcard length part Nguyễn Thái Ngọc Duy
@ 2012-11-19 20:48 ` Junio C Hamano
0 siblings, 0 replies; 11+ messages in thread
From: Junio C Hamano @ 2012-11-19 20:48 UTC (permalink / raw)
To: Nguyễn Thái Ngọc Duy; +Cc: git
Nguyễn Thái Ngọc Duy <pclouds@gmail.com> writes:
> We marks pathspec with wildcards with the field use_wildcard. We could
s/marks/mark;
> do better by saving the length of the non-wildcard part, which can be
> for optimizations such as f9f6e2c (exclude: do strcmp as much as
s/for /used &/;
> possible before fnmatch - 2012-06-07)
>
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
The code looks straightforward and correct. Thanks.
> ---
> builtin/ls-files.c | 2 +-
> builtin/ls-tree.c | 2 +-
> cache.h | 2 +-
> dir.c | 6 +++---
> tree-walk.c | 4 ++--
> 5 files changed, 8 insertions(+), 8 deletions(-)
>
> diff --git a/builtin/ls-files.c b/builtin/ls-files.c
> index b5434af..4a9ee69 100644
> --- a/builtin/ls-files.c
> +++ b/builtin/ls-files.c
> @@ -337,7 +337,7 @@ void overlay_tree_on_cache(const char *tree_name, const char *prefix)
> matchbuf[0] = prefix;
> matchbuf[1] = NULL;
> init_pathspec(&pathspec, matchbuf);
> - pathspec.items[0].use_wildcard = 0;
> + pathspec.items[0].nowildcard_len = pathspec.items[0].len;
> } else
> init_pathspec(&pathspec, NULL);
> if (read_tree(tree, 1, &pathspec))
> diff --git a/builtin/ls-tree.c b/builtin/ls-tree.c
> index 235c17c..fb76e38 100644
> --- a/builtin/ls-tree.c
> +++ b/builtin/ls-tree.c
> @@ -168,7 +168,7 @@ int cmd_ls_tree(int argc, const char **argv, const char *prefix)
>
> init_pathspec(&pathspec, get_pathspec(prefix, argv + 1));
> for (i = 0; i < pathspec.nr; i++)
> - pathspec.items[i].use_wildcard = 0;
> + pathspec.items[i].nowildcard_len = pathspec.items[i].len;
> pathspec.has_wildcard = 0;
> tree = parse_tree_indirect(sha1);
> if (!tree)
> diff --git a/cache.h b/cache.h
> index dbd8018..bf031f1 100644
> --- a/cache.h
> +++ b/cache.h
> @@ -482,7 +482,7 @@ struct pathspec {
> struct pathspec_item {
> const char *match;
> int len;
> - unsigned int use_wildcard:1;
> + int nowildcard_len;
> } *items;
> };
>
> diff --git a/dir.c b/dir.c
> index 5a83aa7..c391d46 100644
> --- a/dir.c
> +++ b/dir.c
> @@ -230,7 +230,7 @@ static int match_pathspec_item(const struct pathspec_item *item, int prefix,
> return MATCHED_RECURSIVELY;
> }
>
> - if (item->use_wildcard && !fnmatch(match, name, 0))
> + if (item->nowildcard_len < item->len && !fnmatch(match, name, 0))
> return MATCHED_FNMATCH;
>
> return 0;
> @@ -1429,8 +1429,8 @@ int init_pathspec(struct pathspec *pathspec, const char **paths)
>
> item->match = path;
> item->len = strlen(path);
> - item->use_wildcard = !no_wildcard(path);
> - if (item->use_wildcard)
> + item->nowildcard_len = simple_length(path);
> + if (item->nowildcard_len < item->len)
> pathspec->has_wildcard = 1;
> }
>
> diff --git a/tree-walk.c b/tree-walk.c
> index 3f54c02..af871c5 100644
> --- a/tree-walk.c
> +++ b/tree-walk.c
> @@ -626,7 +626,7 @@ enum interesting tree_entry_interesting(const struct name_entry *entry,
> &never_interesting))
> return entry_interesting;
>
> - if (item->use_wildcard) {
> + if (item->nowildcard_len < item->len) {
> if (!fnmatch(match + baselen, entry->path, 0))
> return entry_interesting;
>
> @@ -642,7 +642,7 @@ enum interesting tree_entry_interesting(const struct name_entry *entry,
> }
>
> match_wildcards:
> - if (!item->use_wildcard)
> + if (item->nowildcard_len == item->len)
> continue;
>
> /*
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 2/4] pathspec: do exact comparison on the leading non-wildcard part
2012-11-18 9:13 [PATCH 0/4] Some pathspec wildcard optimization Nguyễn Thái Ngọc Duy
2012-11-18 9:13 ` [PATCH 1/4] pathspec: save the non-wildcard length part Nguyễn Thái Ngọc Duy
@ 2012-11-18 9:13 ` Nguyễn Thái Ngọc Duy
2012-11-19 20:54 ` Junio C Hamano
2012-11-18 9:13 ` [PATCH 3/4] pathspec: apply "*.c" optimization from exclude Nguyễn Thái Ngọc Duy
2012-11-18 9:13 ` [PATCH 4/4] tree_entry_interesting: do basedir compare on wildcard patterns when possible Nguyễn Thái Ngọc Duy
3 siblings, 1 reply; 11+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-11-18 9:13 UTC (permalink / raw)
To: git; +Cc: Nguyễn Thái Ngọc Duy
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
dir.c | 18 +++++++++++++++++-
dir.h | 8 ++++++++
tree-walk.c | 6 ++++--
3 files changed, 29 insertions(+), 3 deletions(-)
diff --git a/dir.c b/dir.c
index c391d46..e4e6ca1 100644
--- a/dir.c
+++ b/dir.c
@@ -34,6 +34,21 @@ int fnmatch_icase(const char *pattern, const char *string, int flags)
return fnmatch(pattern, string, flags | (ignore_case ? FNM_CASEFOLD : 0));
}
+inline int git_fnmatch(const char *pattern, const char *string,
+ int flags, int prefix)
+{
+ int fnm_flags = 0;
+ if (flags & GF_PATHNAME)
+ fnm_flags |= FNM_PATHNAME;
+ if (prefix > 0) {
+ if (strncmp(pattern, string, prefix))
+ return FNM_NOMATCH;
+ pattern += prefix;
+ string += prefix;
+ }
+ return fnmatch(pattern, string, fnm_flags);
+}
+
static size_t common_prefix_len(const char **pathspec)
{
const char *n, *first;
@@ -230,7 +245,8 @@ static int match_pathspec_item(const struct pathspec_item *item, int prefix,
return MATCHED_RECURSIVELY;
}
- if (item->nowildcard_len < item->len && !fnmatch(match, name, 0))
+ if (item->nowildcard_len < item->len &&
+ !git_fnmatch(match, name, 0, item->nowildcard_len - prefix))
return MATCHED_FNMATCH;
return 0;
diff --git a/dir.h b/dir.h
index f5c89e3..4cd5074 100644
--- a/dir.h
+++ b/dir.h
@@ -139,4 +139,12 @@ extern int strcmp_icase(const char *a, const char *b);
extern int strncmp_icase(const char *a, const char *b, size_t count);
extern int fnmatch_icase(const char *pattern, const char *string, int flags);
+/*
+ * The prefix part of pattern must not contains wildcards.
+ */
+#define GF_PATHNAME 1
+
+extern int git_fnmatch(const char *pattern, const char *string,
+ int flags, int prefix);
+
#endif
diff --git a/tree-walk.c b/tree-walk.c
index af871c5..2fcf3c0 100644
--- a/tree-walk.c
+++ b/tree-walk.c
@@ -627,7 +627,8 @@ enum interesting tree_entry_interesting(const struct name_entry *entry,
return entry_interesting;
if (item->nowildcard_len < item->len) {
- if (!fnmatch(match + baselen, entry->path, 0))
+ if (!git_fnmatch(match + baselen, entry->path,
+ 0, item->nowildcard_len - baselen))
return entry_interesting;
/*
@@ -652,7 +653,8 @@ match_wildcards:
strbuf_add(base, entry->path, pathlen);
- if (!fnmatch(match, base->buf + base_offset, 0)) {
+ if (!git_fnmatch(match, base->buf + base_offset,
+ 0, item->nowildcard_len)) {
strbuf_setlen(base, base_offset + baselen);
return entry_interesting;
}
--
1.8.0.rc2.23.g1fb49df
^ permalink raw reply related [flat|nested] 11+ messages in thread
* Re: [PATCH 2/4] pathspec: do exact comparison on the leading non-wildcard part
2012-11-18 9:13 ` [PATCH 2/4] pathspec: do exact comparison on the leading non-wildcard part Nguyễn Thái Ngọc Duy
@ 2012-11-19 20:54 ` Junio C Hamano
2012-11-20 13:09 ` Nguyen Thai Ngoc Duy
0 siblings, 1 reply; 11+ messages in thread
From: Junio C Hamano @ 2012-11-19 20:54 UTC (permalink / raw)
To: Nguyễn Thái Ngọc Duy; +Cc: git
Nguyễn Thái Ngọc Duy <pclouds@gmail.com> writes:
> Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
> ---
> dir.c | 18 +++++++++++++++++-
> dir.h | 8 ++++++++
> tree-walk.c | 6 ++++--
> 3 files changed, 29 insertions(+), 3 deletions(-)
>
> diff --git a/dir.c b/dir.c
> index c391d46..e4e6ca1 100644
> --- a/dir.c
> +++ b/dir.c
> @@ -34,6 +34,21 @@ int fnmatch_icase(const char *pattern, const char *string, int flags)
> return fnmatch(pattern, string, flags | (ignore_case ? FNM_CASEFOLD : 0));
> }
>
> +inline int git_fnmatch(const char *pattern, const char *string,
> + int flags, int prefix)
> +{
> + int fnm_flags = 0;
> + if (flags & GF_PATHNAME)
> + fnm_flags |= FNM_PATHNAME;
> + if (prefix > 0) {
> + if (strncmp(pattern, string, prefix))
> + return FNM_NOMATCH;
> + pattern += prefix;
> + string += prefix;
> + }
> + return fnmatch(pattern, string, fnm_flags);
How would we protect this optimization from future breakages?
Once we start using FNM_PERIOD, this becomes unsafe, as the simple
part in "foo/bar*baz" would be "foo/bar" with remainder "*baz".
The pattern "foo/bar*baz" should match "foo/bar.baz" with FNM_PERIOD
set. But with this optimization, fnmatch is fed pattern="*baz",
string=".baz" and they would not match.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 2/4] pathspec: do exact comparison on the leading non-wildcard part
2012-11-19 20:54 ` Junio C Hamano
@ 2012-11-20 13:09 ` Nguyen Thai Ngoc Duy
0 siblings, 0 replies; 11+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2012-11-20 13:09 UTC (permalink / raw)
To: Junio C Hamano; +Cc: git
On Tue, Nov 20, 2012 at 3:54 AM, Junio C Hamano <gitster@pobox.com> wrote:
> How would we protect this optimization from future breakages?
>
> Once we start using FNM_PERIOD, this becomes unsafe, as the simple
> part in "foo/bar*baz" would be "foo/bar" with remainder "*baz".
>
> The pattern "foo/bar*baz" should match "foo/bar.baz" with FNM_PERIOD
> set. But with this optimization, fnmatch is fed pattern="*baz",
> string=".baz" and they would not match.
The first question is how FNM_PERIOD comes in the first place without
breaking the current syntax. I guess we just turn off the optimization
if FNM_PERIOD is used.
My secret "plan" is to convert all fnmatch() calls to git_fnmatch()
then replace fnmatch() with wildmatch() and move these optimization
down to wildmatch(). I think we can handle FNM_PERIOD case better down
there because string is still "foo/bar.baz".
--
Duy
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 3/4] pathspec: apply "*.c" optimization from exclude
2012-11-18 9:13 [PATCH 0/4] Some pathspec wildcard optimization Nguyễn Thái Ngọc Duy
2012-11-18 9:13 ` [PATCH 1/4] pathspec: save the non-wildcard length part Nguyễn Thái Ngọc Duy
2012-11-18 9:13 ` [PATCH 2/4] pathspec: do exact comparison on the leading non-wildcard part Nguyễn Thái Ngọc Duy
@ 2012-11-18 9:13 ` Nguyễn Thái Ngọc Duy
2012-11-19 21:20 ` Junio C Hamano
2012-11-18 9:13 ` [PATCH 4/4] tree_entry_interesting: do basedir compare on wildcard patterns when possible Nguyễn Thái Ngọc Duy
3 siblings, 1 reply; 11+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-11-18 9:13 UTC (permalink / raw)
To: git; +Cc: Nguyễn Thái Ngọc Duy
-O2 build on linux-2.6, without the patch:
$ time git rev-list --quiet HEAD -- '*.c'
real 0m40.770s
user 0m40.290s
sys 0m0.256s
With the patch
$ time ~/w/git/git rev-list --quiet HEAD -- '*.c'
real 0m34.288s
user 0m33.997s
sys 0m0.205s
The above command is not supposed to be widely popular. It's chosen
because it exercises pathspec matching a lot. The point is it cuts
down matching time for popular patterns like *.c, which could be used
as pathspec in other places.
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
cache.h | 3 +++
dir.c | 17 +++++++++++++++--
dir.h | 1 +
tree-walk.c | 6 ++++--
4 files changed, 23 insertions(+), 4 deletions(-)
diff --git a/cache.h b/cache.h
index bf031f1..d18f584 100644
--- a/cache.h
+++ b/cache.h
@@ -473,6 +473,8 @@ extern int index_name_is_other(const struct index_state *, const char *, int);
extern int ie_match_stat(const struct index_state *, struct cache_entry *, struct stat *, unsigned int);
extern int ie_modified(const struct index_state *, struct cache_entry *, struct stat *, unsigned int);
+#define PSF_ONESTAR 1
+
struct pathspec {
const char **raw; /* get_pathspec() result, not freed by free_pathspec() */
int nr;
@@ -483,6 +485,7 @@ struct pathspec {
const char *match;
int len;
int nowildcard_len;
+ int flags;
} *items;
};
diff --git a/dir.c b/dir.c
index e4e6ca1..d00f240 100644
--- a/dir.c
+++ b/dir.c
@@ -46,6 +46,12 @@ inline int git_fnmatch(const char *pattern, const char *string,
pattern += prefix;
string += prefix;
}
+ if (flags & GF_ONESTAR) {
+ int pattern_len = strlen(++pattern);
+ int string_len = strlen(string);
+ return strcmp(pattern,
+ string + string_len - pattern_len);
+ }
return fnmatch(pattern, string, fnm_flags);
}
@@ -246,7 +252,9 @@ static int match_pathspec_item(const struct pathspec_item *item, int prefix,
}
if (item->nowildcard_len < item->len &&
- !git_fnmatch(match, name, 0, item->nowildcard_len - prefix))
+ !git_fnmatch(match, name,
+ item->flags & PSF_ONESTAR ? GF_ONESTAR : 0,
+ item->nowildcard_len - prefix))
return MATCHED_FNMATCH;
return 0;
@@ -1446,8 +1454,13 @@ int init_pathspec(struct pathspec *pathspec, const char **paths)
item->match = path;
item->len = strlen(path);
item->nowildcard_len = simple_length(path);
- if (item->nowildcard_len < item->len)
+ item->flags = 0;
+ if (item->nowildcard_len < item->len) {
pathspec->has_wildcard = 1;
+ if (path[item->nowildcard_len] == '*' &&
+ no_wildcard(path + item->nowildcard_len + 1))
+ item->flags |= PSF_ONESTAR;
+ }
}
qsort(pathspec->items, pathspec->nr,
diff --git a/dir.h b/dir.h
index 4cd5074..590532b 100644
--- a/dir.h
+++ b/dir.h
@@ -143,6 +143,7 @@ extern int fnmatch_icase(const char *pattern, const char *string, int flags);
* The prefix part of pattern must not contains wildcards.
*/
#define GF_PATHNAME 1
+#define GF_ONESTAR 2
extern int git_fnmatch(const char *pattern, const char *string,
int flags, int prefix);
diff --git a/tree-walk.c b/tree-walk.c
index 2fcf3c0..42fe610 100644
--- a/tree-walk.c
+++ b/tree-walk.c
@@ -628,7 +628,8 @@ enum interesting tree_entry_interesting(const struct name_entry *entry,
if (item->nowildcard_len < item->len) {
if (!git_fnmatch(match + baselen, entry->path,
- 0, item->nowildcard_len - baselen))
+ item->flags & PSF_ONESTAR ? GF_ONESTAR : 0,
+ item->nowildcard_len - baselen))
return entry_interesting;
/*
@@ -654,7 +655,8 @@ match_wildcards:
strbuf_add(base, entry->path, pathlen);
if (!git_fnmatch(match, base->buf + base_offset,
- 0, item->nowildcard_len)) {
+ item->flags & PSF_ONESTAR ? GF_ONESTAR : 0,
+ item->nowildcard_len)) {
strbuf_setlen(base, base_offset + baselen);
return entry_interesting;
}
--
1.8.0.rc2.23.g1fb49df
^ permalink raw reply related [flat|nested] 11+ messages in thread
* Re: [PATCH 3/4] pathspec: apply "*.c" optimization from exclude
2012-11-18 9:13 ` [PATCH 3/4] pathspec: apply "*.c" optimization from exclude Nguyễn Thái Ngọc Duy
@ 2012-11-19 21:20 ` Junio C Hamano
2012-11-20 13:23 ` Nguyen Thai Ngoc Duy
0 siblings, 1 reply; 11+ messages in thread
From: Junio C Hamano @ 2012-11-19 21:20 UTC (permalink / raw)
To: Nguyễn Thái Ngọc Duy; +Cc: git
Nguyễn Thái Ngọc Duy <pclouds@gmail.com> writes:
> -O2 build on linux-2.6, without the patch:
Before the result, can you briefly explain what '"*.c" optimization
from exclude' the title talks about is?
When a pattern contains only a single asterisk, e.g. "foo*bar",
after literally comparing the leading part "foo" with the
string, we can compare the tail of the string and make sure it
matches "bar", instead of running fnmatch() on "*bar" against
the remainder of the string.
The funny thing was that trying to explain what the logic should do
makes one realize potential issues in the implementation of that
logic ;-) See below.
> $ time git rev-list --quiet HEAD -- '*.c'
>
> real 0m40.770s
> user 0m40.290s
> sys 0m0.256s
>
> With the patch
>
> $ time ~/w/git/git rev-list --quiet HEAD -- '*.c'
>
> real 0m34.288s
> user 0m33.997s
> sys 0m0.205s
>
> The above command is not supposed to be widely popular.
Hrm, perhaps. I use "git grep <pattern> -- \*.c" quite a lot, but
haven't seen use case for \*.c in the context of the "log" family.
> diff --git a/cache.h b/cache.h
> index bf031f1..d18f584 100644
> --- a/cache.h
> +++ b/cache.h
> @@ -473,6 +473,8 @@ extern int index_name_is_other(const struct index_state *, const char *, int);
> extern int ie_match_stat(const struct index_state *, struct cache_entry *, struct stat *, unsigned int);
> extern int ie_modified(const struct index_state *, struct cache_entry *, struct stat *, unsigned int);
>
> +#define PSF_ONESTAR 1
Together with the GF_ prefix in the previous, PSF_ prefix needs a
bit of in-code explanation. Is it just an RC3L (random combination
of 3 letters?)
> @@ -46,6 +46,12 @@ inline int git_fnmatch(const char *pattern, const char *string,
> pattern += prefix;
> string += prefix;
> }
> + if (flags & GF_ONESTAR) {
> + int pattern_len = strlen(++pattern);
> + int string_len = strlen(string);
> + return strcmp(pattern,
> + string + string_len - pattern_len);
> + }
What happens when pattern="foo*oob" and string="foob"?
The prefix match before this code determines that the literal prefix
in the pattern matches with the early part of the string, and makes
pattern="*oob" and string="b".
When you come to strcmp(), you see that string_len is 1, pattern_len
is 3, and pattern is "oob". string+string_len-pattern_len = "oob",
one past the beginning of the original string "foob". They match.
Oops?
return (string_len < pattern_len) ||
strcmp(pattern, string + string_len - pattern_len);
perhaps?
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 3/4] pathspec: apply "*.c" optimization from exclude
2012-11-19 21:20 ` Junio C Hamano
@ 2012-11-20 13:23 ` Nguyen Thai Ngoc Duy
2012-11-20 19:47 ` Junio C Hamano
0 siblings, 1 reply; 11+ messages in thread
From: Nguyen Thai Ngoc Duy @ 2012-11-20 13:23 UTC (permalink / raw)
To: Junio C Hamano; +Cc: git
On Tue, Nov 20, 2012 at 4:20 AM, Junio C Hamano <gitster@pobox.com> wrote:
>> $ time git rev-list --quiet HEAD -- '*.c'
>>
>> real 0m40.770s
>> user 0m40.290s
>> sys 0m0.256s
>>
>> With the patch
>>
>> $ time ~/w/git/git rev-list --quiet HEAD -- '*.c'
>>
>> real 0m34.288s
>> user 0m33.997s
>> sys 0m0.205s
>>
>> The above command is not supposed to be widely popular.
>
> Hrm, perhaps. I use "git grep <pattern> -- \*.c" quite a lot, but
> haven't seen use case for \*.c in the context of the "log" family.
"git diff *.c" is also helpful (maybe "git diff *.[ch]" is more often
used). But I suspect I/O dominates in both grep and diff cases. I just
try to make sure matching code won't show up in profile.
>> +#define PSF_ONESTAR 1
>
> Together with the GF_ prefix in the previous, PSF_ prefix needs a
> bit of in-code explanation. Is it just an RC3L (random combination
> of 3 letters?)
I'm pretty sure "PS" stands for pathspec. "F" is probably from
fnmatch. Any suggestions?
>> @@ -46,6 +46,12 @@ inline int git_fnmatch(const char *pattern, const char *string,
>> pattern += prefix;
>> string += prefix;
>> }
>> + if (flags & GF_ONESTAR) {
>> + int pattern_len = strlen(++pattern);
>> + int string_len = strlen(string);
>> + return strcmp(pattern,
>> + string + string_len - pattern_len);
>> + }
>
> What happens when pattern="foo*oob" and string="foob"?
>
> The prefix match before this code determines that the literal prefix
> in the pattern matches with the early part of the string, and makes
> pattern="*oob" and string="b".
>
> When you come to strcmp(), you see that string_len is 1, pattern_len
> is 3, and pattern is "oob". string+string_len-pattern_len = "oob",
> one past the beginning of the original string "foob". They match.
>
> Oops?
Oops indead. I'll need to check exclude code too :(
> return (string_len < pattern_len) ||
> strcmp(pattern, string + string_len - pattern_len);
>
> perhaps?
Yeah.
--
Duy
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 3/4] pathspec: apply "*.c" optimization from exclude
2012-11-20 13:23 ` Nguyen Thai Ngoc Duy
@ 2012-11-20 19:47 ` Junio C Hamano
0 siblings, 0 replies; 11+ messages in thread
From: Junio C Hamano @ 2012-11-20 19:47 UTC (permalink / raw)
To: Nguyen Thai Ngoc Duy; +Cc: git
Nguyen Thai Ngoc Duy <pclouds@gmail.com> writes:
>> When you come to strcmp(), you see that string_len is 1, pattern_len
>> is 3, and pattern is "oob". string+string_len-pattern_len = "oob",
>> one past the beginning of the original string "foob". They match.
>>
>> Oops?
>
> Oops indead. I'll need to check exclude code too :(
Ok, the one queued on 'pu' has been locally fixed when I sent the
message you are responding to.
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 4/4] tree_entry_interesting: do basedir compare on wildcard patterns when possible
2012-11-18 9:13 [PATCH 0/4] Some pathspec wildcard optimization Nguyễn Thái Ngọc Duy
` (2 preceding siblings ...)
2012-11-18 9:13 ` [PATCH 3/4] pathspec: apply "*.c" optimization from exclude Nguyễn Thái Ngọc Duy
@ 2012-11-18 9:13 ` Nguyễn Thái Ngọc Duy
3 siblings, 0 replies; 11+ messages in thread
From: Nguyễn Thái Ngọc Duy @ 2012-11-18 9:13 UTC (permalink / raw)
To: git; +Cc: Nguyễn Thái Ngọc Duy
Currently we treat "*.c" and "path/to/*.c" the same way. Which means
we check all possible paths in repo against "path/to/*.c". One could
see that "path/elsewhere/foo.c" obviously cannot match "path/to/*.c"
and we only need to check all paths _inside_ "path/to/" against that
pattern.
This patch checks the leading fixed part of a pathspec against base
directory and exit early if possible. We could even optimize further
in "path/to/something*.c" case (i.e. check the fixed part against
name_entry as well) but that's more complicated and probably does not
gain us much.
-O2 build on linux-2.6, without and with this patch respectively:
$ time git rev-list --quiet HEAD -- 'drivers/*.c'
real 1m9.484s
user 1m9.128s
sys 0m0.181s
$ time ~/w/git/git rev-list --quiet HEAD -- 'drivers/*.c'
real 0m15.710s
user 0m15.564s
sys 0m0.107s
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
tree-walk.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 62 insertions(+), 1 deletion(-)
diff --git a/tree-walk.c b/tree-walk.c
index 42fe610..dcc1015 100644
--- a/tree-walk.c
+++ b/tree-walk.c
@@ -573,6 +573,52 @@ static int match_dir_prefix(const char *base,
}
/*
+ * Perform matching on the leading non-wildcard part of
+ * pathspec. item->nowildcard_len must be greater than zero. Return
+ * non-zero if base is matched.
+ */
+static int match_wildcard_base(const struct pathspec_item *item,
+ const char *base, int baselen,
+ int *matched)
+{
+ const char *match = item->match;
+ /* the wildcard part is not considered in this function */
+ int matchlen = item->nowildcard_len;
+
+ if (baselen) {
+ int dirlen;
+ /*
+ * Return early if base is longer than the
+ * non-wildcard part but it does not match.
+ */
+ if (baselen >= matchlen) {
+ *matched = matchlen;
+ return !strncmp(base, match, matchlen);
+ }
+
+ dirlen = matchlen;
+ while (dirlen && match[dirlen - 1] != '/')
+ dirlen--;
+
+ /* Return early if base is shorter than the
+ non-wildcard part but it does not match. Note that
+ base ends with '/' so we are sure it really matches
+ directory */
+ if (strncmp(base, match, baselen))
+ return 0;
+ *matched = baselen;
+ } else
+ *matched = 0;
+ /*
+ * we could have checked entry against the non-wildcard part
+ * that is not in base and does similar never_interesting
+ * optimization as in match_entry. For now just be happy with
+ * base comparison.
+ */
+ return entry_interesting;
+}
+
+/*
* Is a tree entry interesting given the pathspec we have?
*
* Pre-condition: either baselen == base_offset (i.e. empty path)
@@ -602,7 +648,7 @@ enum interesting tree_entry_interesting(const struct name_entry *entry,
const struct pathspec_item *item = ps->items+i;
const char *match = item->match;
const char *base_str = base->buf + base_offset;
- int matchlen = item->len;
+ int matchlen = item->len, matched = 0;
if (baselen >= matchlen) {
/* If it doesn't match, move along... */
@@ -647,9 +693,24 @@ match_wildcards:
if (item->nowildcard_len == item->len)
continue;
+ if (item->nowildcard_len &&
+ !match_wildcard_base(item, base_str, baselen, &matched))
+ return entry_not_interesting;
+
/*
* Concatenate base and entry->path into one and do
* fnmatch() on it.
+ *
+ * While we could avoid concatenation in certain cases
+ * [1], which saves a memcpy and potentially a
+ * realloc, it turns out not worth it. Measurement on
+ * linux-2.6 does not show any clear improvements,
+ * partly because of the nowildcard_len optimization
+ * in git_fnmatch(). Avoid micro-optimizations here.
+ *
+ * [1] if match_wildcard_base() says the base
+ * directory is already matched, we only need to match
+ * the rest, which is shorter so _in theory_ faster.
*/
strbuf_add(base, entry->path, pathlen);
--
1.8.0.rc2.23.g1fb49df
^ permalink raw reply related [flat|nested] 11+ messages in thread