From: "Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
To: git@vger.kernel.org
Cc: "Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
Subject: [PATCH v3 09/13] exclude: filter out patterns not applicable to the current directory
Date: Tue, 12 Mar 2013 20:04:56 +0700 [thread overview]
Message-ID: <1363093500-16796-10-git-send-email-pclouds@gmail.com> (raw)
In-Reply-To: <1363093500-16796-1-git-send-email-pclouds@gmail.com>
.gitignore files are spread over directories (*) so that when we check
for ignored files at foo, we are not bothered by foo/bar/.gitignore,
which contains ignore rules for foo/bar only.
This is not enough. foo/.gitignore can contain the pattern
"foo/bar/*.c". When we stay at foo, we know that the pattern cannot
match anything. Similarly, the pattern "/autom4te.cache" at root
directory cannot match anything in foo. This patch attempts to filter
out such patterns to drive down matching cost.
The algorithm implemented here is a naive one. Patterns can be either
active or passive:
- When we enter a new directory (e.g. from root to foo), currently
active patterns may no longer be applicable and can be turned to
passive.
- Conversely, when we leave a directory (foo back to root),
passive patterns may come alive again.
We could do smarter things. But this implementation cuts a big portion
of cost already (and solves the "root .gitignore is evil" problem).
There's probably no need to be smart.
(*) this design forces us to try to find .gitignore in every
directory. On webkit.git that amounts to 6k open syscalls. It feels
like ".svn in every directory" again. I suggest we add
~/.gitignore.master, containing the list of .gitignore files in the
worktree. If this file exists, we don't poke at every directory for
.gitignore.
treat_leading_path: 0.000 0.000
read_directory: 3.455 2.879
+treat_one_path: 2.203 1.620
++is_excluded: 2.000 1.416
+++prep_exclude: 0.171 0.198
+++matching: 1.509 0.904
++dir_exist: 0.036 0.035
++index_name_exists: 0.292 0.289
lazy_init_name_hash: 0.257 0.257
+simplify_away: 0.084 0.085
+dir_add_name: 0.446 0.446
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
dir.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
dir.h | 1 +
2 files changed, 92 insertions(+), 2 deletions(-)
diff --git a/dir.c b/dir.c
index 932fd2f..c57bf06 100644
--- a/dir.c
+++ b/dir.c
@@ -458,7 +458,7 @@ void add_exclude(const char *string, const char *base,
x->base = base;
x->baselen = baselen;
x->pattern_baselen = pattern_baselen;
- x->flags = flags;
+ x->flags = flags | EXC_FLAG_ACTIVE;
x->srcpos = srcpos;
ALLOC_GROW(el->excludes, el->nr + 1, el->alloc);
el->excludes[el->nr++] = x;
@@ -591,6 +591,87 @@ void add_excludes_from_file(struct dir_struct *dir, const char *fname)
die("cannot use %s as an exclude file", fname);
}
+static int pattern_match_base(struct dir_struct *dir,
+ const char *base, int baselen,
+ const struct exclude *exc)
+{
+ const char *pattern;
+
+ /*
+ * Return 1 if base is [exc->base '/'] + the pattern's base, else 0.
+ * TODO: if patterns come from the same .gitignore, exc->base is
+ * the same for all of them. We could compare once and reuse the
+ * result, instead of performing the comparison per pattern.
+ */
+ if (exc->baselen) {
+ if (baselen < exc->baselen + 1) /* too short to hold exc->base and a '/' */
+ return 0;
+
+ if (base[exc->baselen] != '/' ||
+ memcmp(base, exc->base, exc->baselen))
+ return 0;
+
+ base += exc->baselen + 1; /* skip exc->base and the '/' after it */
+ baselen -= exc->baselen + 1;
+ }
+
+ if (baselen != exc->pattern_baselen) /* the rest must be exactly as long as the pattern's base */
+ return 0;
+
+ if (exc->pattern_baselen) {
+ pattern = exc->pattern;
+ if (*pattern == '/') /* a leading '/' is not part of the compared bytes */
+ pattern++;
+ if (memcmp(base, pattern, exc->pattern_baselen))
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * If pushed is non-zero, we have entered a new directory. Some
+ * pathname patterns may no longer be applicable. Go over all active
+ * patterns and disable them if so.
+ *
+ * If popped is non-zero, we have left a directory. Inactive patterns
+ * may be applicable again. Go over them and re-enable if so.
+ */
+static void scan_patterns(struct dir_struct *dir,
+ const char *base, int baselen,
+ int pushed, int popped)
+{
+ int i, j, k;
+
+ for (i = EXC_CMDL; i <= EXC_FILE; i++) { /* every group from EXC_CMDL through EXC_FILE */
+ struct exclude_list_group *group = &dir->exclude_list_group[i];
+ for (j = group->nr - 1; j >= 0; j--) { /* lists are visited last to first */
+ struct exclude_list *list = &group->el[j];
+ for (k = 0; k < list->nr; k++) {
+ struct exclude *exc = list->excludes[k];
+
+ /*
+ * -1 marks a pattern with no base (i.e.
+ * EXC_FLAG_NODIR) or one applicable to many
+ * bases ("**" patterns): never toggled here
+ */
+ if (exc->pattern_baselen == -1)
+ continue;
+
+ if (exc->flags & EXC_FLAG_ACTIVE) {
+ if (pushed &&
+ !pattern_match_base(dir, base, baselen, exc))
+ exc->flags &= ~EXC_FLAG_ACTIVE; /* base no longer matches: deactivate */
+ } else {
+ if (popped &&
+ pattern_match_base(dir, base, baselen, exc))
+ exc->flags |= EXC_FLAG_ACTIVE; /* base matches again: reactivate */
+ }
+ }
+ }
+ }
+}
+
/*
* Loads the per-directory exclude list for the substring of base
* which has a char length of baselen.
@@ -600,7 +681,7 @@ static void prep_exclude(struct dir_struct *dir, const char *base, int baselen)
struct exclude_list_group *group;
struct exclude_list *el;
struct exclude_stack *stk = NULL;
- int current;
+ int current, popped = 0, pushed = 0;
if ((!dir->exclude_per_dir) ||
(baselen + strlen(dir->exclude_per_dir) >= PATH_MAX))
@@ -621,6 +702,7 @@ static void prep_exclude(struct dir_struct *dir, const char *base, int baselen)
clear_exclude_list(el);
free(stk);
group->nr--;
+ popped++;
}
/* Read from the parent directories and push them down. */
@@ -659,8 +741,12 @@ static void prep_exclude(struct dir_struct *dir, const char *base, int baselen)
el, 1);
dir->exclude_stack = stk;
current = stk->baselen;
+ pushed++;
}
dir->basebuf[baselen] = '\0';
+
+ if (pushed | popped)
+ scan_patterns(dir, base, baselen, pushed, popped);
}
int match_basename(const char *basename, int basenamelen,
@@ -755,6 +841,9 @@ static struct exclude *last_exclude_matching_from_list(const char *pathname,
const char *exclude = x->pattern;
int prefix = x->nowildcardlen;
+ if (!(x->flags & EXC_FLAG_ACTIVE))
+ continue;
+
if (x->flags & EXC_FLAG_MUSTBEDIR) {
if (*dtype == DT_UNKNOWN)
*dtype = get_dtype(NULL, pathname, pathlen);
diff --git a/dir.h b/dir.h
index cb50a85..247bfda 100644
--- a/dir.h
+++ b/dir.h
@@ -14,6 +14,7 @@ struct dir_entry {
#define EXC_FLAG_ENDSWITH 4
#define EXC_FLAG_MUSTBEDIR 8
#define EXC_FLAG_NEGATIVE 16
+#define EXC_FLAG_ACTIVE 32
/*
* Each excludes file will be parsed into a fresh exclude_list which
--
1.8.1.2.536.gf441e6d
next prev parent reply other threads:[~2013-03-12 13:06 UTC|newest]
Thread overview: 48+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-03-09 4:09 [PATCH 0/3] Trivial (and small) exclude optimizations Nguyễn Thái Ngọc Duy
2013-03-09 4:09 ` [PATCH 1/3] match_pathname: avoid calling strncmp if baselen is 0 Nguyễn Thái Ngọc Duy
2013-03-09 9:06 ` Antoine Pelisse
2013-03-09 4:09 ` [PATCH 2/3] dir.c: inline convenient *_icase helpers Nguyễn Thái Ngọc Duy
2013-03-09 4:09 ` [PATCH 3/3] match_basename: use strncmp instead of strcmp Nguyễn Thái Ngọc Duy
2013-03-09 7:50 ` Junio C Hamano
2013-03-09 8:47 ` Fredrik Gustafsson
2013-03-09 9:58 ` Duy Nguyen
2013-03-10 6:14 ` [PATCH v2 0/6] Exclude optimizations Nguyễn Thái Ngọc Duy
2013-03-10 6:14 ` [PATCH v2 1/6] match_pathname: avoid calling strncmp if baselen is 0 Nguyễn Thái Ngọc Duy
2013-03-10 6:14 ` [PATCH v2 2/6] dir.c: inline convenient *_icase helpers Nguyễn Thái Ngọc Duy
2013-03-10 6:14 ` [PATCH v2 3/6] match_basename: use strncmp instead of strcmp Nguyễn Thái Ngọc Duy
2013-03-10 7:34 ` Junio C Hamano
2013-03-10 10:38 ` Duy Nguyen
2013-03-10 11:43 ` Antoine Pelisse
2013-03-10 11:54 ` Antoine Pelisse
2013-03-10 12:06 ` Duy Nguyen
2013-03-10 12:11 ` Antoine Pelisse
2013-03-10 12:14 ` Duy Nguyen
2013-03-12 20:59 ` Junio C Hamano
2013-03-13 1:11 ` Duy Nguyen
2013-03-10 6:14 ` [PATCH v2 4/6] match_{base,path}name: replace strncmp_icase with strnequal_icase Nguyễn Thái Ngọc Duy
2013-03-10 6:14 ` [PATCH v2 5/6] dir.c: pass pathname length to last_exclude_matching Nguyễn Thái Ngọc Duy
2013-03-10 6:14 ` [PATCH v2 6/6] exclude: filter patterns by directory level Nguyễn Thái Ngọc Duy
2013-03-10 8:20 ` Junio C Hamano
2013-03-10 10:18 ` Duy Nguyen
2013-03-10 10:58 ` Junio C Hamano
2013-03-10 11:14 ` Duy Nguyen
2013-03-11 15:11 ` [PATCH v2 0/6] Exclude optimizations Duy Nguyen
2013-03-12 13:04 ` [PATCH v3 00/13] " Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` [PATCH v3 01/13] dir.c: add MEASURE_EXCLUDE code for tracking exclude performance Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` [PATCH v3 02/13] match_pathname: avoid calling strncmp if baselen is 0 Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` [PATCH v3 03/13] dir.c: inline convenient *_icase helpers Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` [PATCH v3 04/13] match_basename: use strncmp instead of strcmp Nguyễn Thái Ngọc Duy
2013-03-12 17:40 ` Antoine Pelisse
2013-03-13 1:05 ` Duy Nguyen
2013-03-12 13:04 ` [PATCH v3 05/13] match_{base,path}name: replace strncmp_icase with memequal_icase Nguyễn Thái Ngọc Duy
2013-03-13 1:14 ` Duy Nguyen
2013-03-12 13:04 ` [PATCH v3 06/13] dir: pass pathname length to last_exclude_matching Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` [PATCH v3 07/13] exclude: avoid calling prep_exclude on entries of the same directory Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` [PATCH v3 08/13] exclude: record baselen in the pattern Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` Nguyễn Thái Ngọc Duy [this message]
2013-03-12 23:13 ` [PATCH v3 09/13] exclude: filter out patterns not applicable to the current directory Eric Sunshine
2013-03-12 13:04 ` [PATCH v3 10/13] read_directory: avoid invoking exclude machinery on tracked files Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` [PATCH v3 11/13] Preallocate hash tables when the number of inserts are known in advance Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` [PATCH v3 12/13] name-hash: allow to lookup a name with precalculated base hash Nguyễn Thái Ngọc Duy
2013-03-12 13:05 ` [PATCH v3 13/13] read_directory: calculate name hashes incrementally Nguyễn Thái Ngọc Duy
2013-03-14 13:05 ` [PATCH v3 00/13] Exclude optimizations Duy Nguyen
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1363093500-16796-10-git-send-email-pclouds@gmail.com \
--to=pclouds@gmail.com \
--cc=git@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.