From: "Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
To: git@vger.kernel.org
Cc: "Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
Subject: [PATCH v3 09/13] exclude: filter out patterns not applicable to the current directory
Date: Tue, 12 Mar 2013 20:04:56 +0700 [thread overview]
Message-ID: <1363093500-16796-10-git-send-email-pclouds@gmail.com> (raw)
In-Reply-To: <1363093500-16796-1-git-send-email-pclouds@gmail.com>
.gitignore files are spread over directories (*) so that when we check
for ignored files at foo, we are not bothered by foo/bar/.gitignore,
which contains ignore rules for foo/bar only.
This is not enough. foo/.gitignore can contain the pattern
"foo/bar/*.c". When we stay at foo, we know that the pattern cannot
match anything. Similarly, the pattern "/autom4te.cache" at root
directory cannot match anything in foo. This patch attempts to filter
out such patterns to drive down matching cost.
The algorithm implemented here is a naive one. Patterns can be either
active or passive:
- When we enter a new directory (e.g. from root to foo), currently
active patterns may no longer be applicable and can be turned to
passive.
- Conversely, when we leave a directory (foo back to root),
passive patterns may come alive again.
We could do smarter things. But this implementation cuts a big portion
of cost already (and solves the "root .gitignore is evil" problem).
There's probably no need to be smart.
(*) this design forces us to try to find .gitignore at every
directory. On webkit.git that amounts to 6k open syscalls. It feels
like ".svn on every directory" again. I suggest we add
~/.gitignore.master, containing the list of .gitignore files in
the worktree. If this file exists, we don't poke at every directory for
.gitignore.
treat_leading_path: 0.000 0.000
read_directory: 3.455 2.879
+treat_one_path: 2.203 1.620
++is_excluded: 2.000 1.416
+++prep_exclude: 0.171 0.198
+++matching: 1.509 0.904
++dir_exist: 0.036 0.035
++index_name_exists: 0.292 0.289
lazy_init_name_hash: 0.257 0.257
+simplify_away: 0.084 0.085
+dir_add_name: 0.446 0.446
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
dir.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
dir.h | 1 +
2 files changed, 92 insertions(+), 2 deletions(-)
diff --git a/dir.c b/dir.c
index 932fd2f..c57bf06 100644
--- a/dir.c
+++ b/dir.c
@@ -458,7 +458,7 @@ void add_exclude(const char *string, const char *base,
x->base = base;
x->baselen = baselen;
x->pattern_baselen = pattern_baselen;
- x->flags = flags;
+ x->flags = flags | EXC_FLAG_ACTIVE;
x->srcpos = srcpos;
ALLOC_GROW(el->excludes, el->nr + 1, el->alloc);
el->excludes[el->nr++] = x;
@@ -591,6 +591,87 @@ void add_excludes_from_file(struct dir_struct *dir, const char *fname)
die("cannot use %s as an exclude file", fname);
}
+static int pattern_match_base(struct dir_struct *dir,
+ const char *base, int baselen,
+ const struct exclude *exc)
+{
+ const char *pattern;
+
+ /*
+ * TODO: if patterns come from a .gitignore, exc->base would
+ * be the same for all of them. We could compare once and
+ * reuse the result, instead of performing the comparison per
+ * pattern like this.
+ */
+ if (exc->baselen) {
+ if (baselen < exc->baselen + 1)
+ return 0;
+
+ if (base[exc->baselen] != '/' ||
+ memcmp(base, exc->base, exc->baselen))
+ return 0;
+
+ base += exc->baselen + 1;
+ baselen -= exc->baselen + 1;
+ }
+
+ if (baselen != exc->pattern_baselen)
+ return 0;
+
+ if (exc->pattern_baselen) {
+ pattern = exc->pattern;
+ if (*pattern == '/')
+ pattern++;
+ if (memcmp(base, pattern, exc->pattern_baselen))
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * If pushed is non-zero, we have entered a new directory. Some
+ * pathname patterns may no longer be applicable. Go over all active
+ * patterns and disable them if so.
+ *
+ * If popped is non-zero, we have left a directory. Inactive patterns
+ * may be applicable again. Go over them and re-enable if so.
+ */
+static void scan_patterns(struct dir_struct *dir,
+ const char *base, int baselen,
+ int pushed, int popped)
+{
+ int i, j, k;
+
+ for (i = EXC_CMDL; i <= EXC_FILE; i++) {
+ struct exclude_list_group *group = &dir->exclude_list_group[i];
+ for (j = group->nr - 1; j >= 0; j--) {
+ struct exclude_list *list = &group->el[j];
+ for (k = 0; k < list->nr; k++) {
+ struct exclude *exc = list->excludes[k];
+
+ /*
+ * No base (i.e. EXC_FLAG_NODIR) or
+ * applicable to many bases ("**"
+ * patterns)
+ */
+ if (exc->pattern_baselen == -1)
+ continue;
+
+ if (exc->flags & EXC_FLAG_ACTIVE) {
+ if (pushed &&
+ !pattern_match_base(dir, base, baselen, exc))
+ exc->flags &= ~EXC_FLAG_ACTIVE;
+ } else {
+ if (popped &&
+ pattern_match_base(dir, base, baselen, exc))
+ exc->flags |= EXC_FLAG_ACTIVE;
+ }
+ }
+ }
+ }
+}
+
/*
* Loads the per-directory exclude list for the substring of base
* which has a char length of baselen.
@@ -600,7 +681,7 @@ static void prep_exclude(struct dir_struct *dir, const char *base, int baselen)
struct exclude_list_group *group;
struct exclude_list *el;
struct exclude_stack *stk = NULL;
- int current;
+ int current, popped = 0, pushed = 0;
if ((!dir->exclude_per_dir) ||
(baselen + strlen(dir->exclude_per_dir) >= PATH_MAX))
@@ -621,6 +702,7 @@ static void prep_exclude(struct dir_struct *dir, const char *base, int baselen)
clear_exclude_list(el);
free(stk);
group->nr--;
+ popped++;
}
/* Read from the parent directories and push them down. */
@@ -659,8 +741,12 @@ static void prep_exclude(struct dir_struct *dir, const char *base, int baselen)
el, 1);
dir->exclude_stack = stk;
current = stk->baselen;
+ pushed++;
}
dir->basebuf[baselen] = '\0';
+
+ if (pushed | popped)
+ scan_patterns(dir, base, baselen, pushed, popped);
}
int match_basename(const char *basename, int basenamelen,
@@ -755,6 +841,9 @@ static struct exclude *last_exclude_matching_from_list(const char *pathname,
const char *exclude = x->pattern;
int prefix = x->nowildcardlen;
+ if (!(x->flags & EXC_FLAG_ACTIVE))
+ continue;
+
if (x->flags & EXC_FLAG_MUSTBEDIR) {
if (*dtype == DT_UNKNOWN)
*dtype = get_dtype(NULL, pathname, pathlen);
diff --git a/dir.h b/dir.h
index cb50a85..247bfda 100644
--- a/dir.h
+++ b/dir.h
@@ -14,6 +14,7 @@ struct dir_entry {
#define EXC_FLAG_ENDSWITH 4
#define EXC_FLAG_MUSTBEDIR 8
#define EXC_FLAG_NEGATIVE 16
+#define EXC_FLAG_ACTIVE 32
/*
* Each excludes file will be parsed into a fresh exclude_list which
--
1.8.1.2.536.gf441e6d
next prev parent reply other threads:[~2013-03-12 13:06 UTC|newest]
Thread overview: 48+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-03-09 4:09 [PATCH 0/3] Trivial (and small) exclude optimizations Nguyễn Thái Ngọc Duy
2013-03-09 4:09 ` [PATCH 1/3] match_pathname: avoid calling strncmp if baselen is 0 Nguyễn Thái Ngọc Duy
2013-03-09 9:06 ` Antoine Pelisse
2013-03-09 4:09 ` [PATCH 2/3] dir.c: inline convenient *_icase helpers Nguyễn Thái Ngọc Duy
2013-03-09 4:09 ` [PATCH 3/3] match_basename: use strncmp instead of strcmp Nguyễn Thái Ngọc Duy
2013-03-09 7:50 ` Junio C Hamano
2013-03-09 8:47 ` Fredrik Gustafsson
2013-03-09 9:58 ` Duy Nguyen
2013-03-10 6:14 ` [PATCH v2 0/6] Exclude optimizations Nguyễn Thái Ngọc Duy
2013-03-10 6:14 ` [PATCH v2 1/6] match_pathname: avoid calling strncmp if baselen is 0 Nguyễn Thái Ngọc Duy
2013-03-10 6:14 ` [PATCH v2 2/6] dir.c: inline convenient *_icase helpers Nguyễn Thái Ngọc Duy
2013-03-10 6:14 ` [PATCH v2 3/6] match_basename: use strncmp instead of strcmp Nguyễn Thái Ngọc Duy
2013-03-10 7:34 ` Junio C Hamano
2013-03-10 10:38 ` Duy Nguyen
2013-03-10 11:43 ` Antoine Pelisse
2013-03-10 11:54 ` Antoine Pelisse
2013-03-10 12:06 ` Duy Nguyen
2013-03-10 12:11 ` Antoine Pelisse
2013-03-10 12:14 ` Duy Nguyen
2013-03-12 20:59 ` Junio C Hamano
2013-03-13 1:11 ` Duy Nguyen
2013-03-10 6:14 ` [PATCH v2 4/6] match_{base,path}name: replace strncmp_icase with strnequal_icase Nguyễn Thái Ngọc Duy
2013-03-10 6:14 ` [PATCH v2 5/6] dir.c: pass pathname length to last_exclude_matching Nguyễn Thái Ngọc Duy
2013-03-10 6:14 ` [PATCH v2 6/6] exclude: filter patterns by directory level Nguyễn Thái Ngọc Duy
2013-03-10 8:20 ` Junio C Hamano
2013-03-10 10:18 ` Duy Nguyen
2013-03-10 10:58 ` Junio C Hamano
2013-03-10 11:14 ` Duy Nguyen
2013-03-11 15:11 ` [PATCH v2 0/6] Exclude optimizations Duy Nguyen
2013-03-12 13:04 ` [PATCH v3 00/13] " Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` [PATCH v3 01/13] dir.c: add MEASURE_EXCLUDE code for tracking exclude performance Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` [PATCH v3 02/13] match_pathname: avoid calling strncmp if baselen is 0 Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` [PATCH v3 03/13] dir.c: inline convenient *_icase helpers Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` [PATCH v3 04/13] match_basename: use strncmp instead of strcmp Nguyễn Thái Ngọc Duy
2013-03-12 17:40 ` Antoine Pelisse
2013-03-13 1:05 ` Duy Nguyen
2013-03-12 13:04 ` [PATCH v3 05/13] match_{base,path}name: replace strncmp_icase with memequal_icase Nguyễn Thái Ngọc Duy
2013-03-13 1:14 ` Duy Nguyen
2013-03-12 13:04 ` [PATCH v3 06/13] dir: pass pathname length to last_exclude_matching Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` [PATCH v3 07/13] exclude: avoid calling prep_exclude on entries of the same directory Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` [PATCH v3 08/13] exclude: record baselen in the pattern Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` Nguyễn Thái Ngọc Duy [this message]
2013-03-12 23:13 ` [PATCH v3 09/13] exclude: filter out patterns not applicable to the current directory Eric Sunshine
2013-03-12 13:04 ` [PATCH v3 10/13] read_directory: avoid invoking exclude machinery on tracked files Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` [PATCH v3 11/13] Preallocate hash tables when the number of inserts are known in advance Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` [PATCH v3 12/13] name-hash: allow to lookup a name with precalculated base hash Nguyễn Thái Ngọc Duy
2013-03-12 13:05 ` [PATCH v3 13/13] read_directory: calculate name hashes incrementally Nguyễn Thái Ngọc Duy
2013-03-14 13:05 ` [PATCH v3 00/13] Exclude optimizations Duy Nguyen
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1363093500-16796-10-git-send-email-pclouds@gmail.com \
--to=pclouds@gmail.com \
--cc=git@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).