From: "Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
To: git@vger.kernel.org
Cc: "Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>,
"Junio C Hamano" <gitster@pobox.com>
Subject: [PATCH v3 10/13] read_directory: avoid invoking exclude machinery on tracked files
Date: Tue, 12 Mar 2013 20:04:57 +0700 [thread overview]
Message-ID: <1363093500-16796-11-git-send-email-pclouds@gmail.com> (raw)
In-Reply-To: <1363093500-16796-1-git-send-email-pclouds@gmail.com>
read_directory() (and its friendly wrapper fill_directory) collects
untracked/ignored files by traversing through the whole worktree,
feeding every entry to treat_one_path(), where each entry is checked
against .gitignore patterns.
Tracked files can't be excluded, so we do not need to run them
through the exclude machinery. On repos where there are many
.gitignore patterns and/or a lot of tracked files, this unnecessary
processing can become expensive.
This patch avoids that processing in the common cases. Directories are
still processed as before, and the shortcut is not taken when
DIR_SHOW_IGNORED or DIR_COLLECT_IGNORED is set. Those flags are not
normally used unless some options are given (e.g. "checkout
--overwrite-ignore", "add -f"...).
treat_one_path's behavior changes when taking this shortcut. With
current code, when a non-directory path is not excluded,
treat_one_path calls treat_file, which returns the initial value of
exclude_file and causes treat_one_path to return path_handled. With
this patch, under the same conditions, treat_one_path returns
path_ignored if the path is tracked.
read_directory_recursive() cares about this difference. Check out the
snippet:
while (...) {
switch (treat_path(...)) {
case path_ignored:
continue;
case path_handled:
break;
}
contents++;
if (check_only)
break;
dir_add_name(dir, path.buf, path.len);
}
If path_handled is returned, contents goes up. And if check_only is
true, the loop could be broken early. Neither of these happens when
treat_one_path (and its wrapper treat_path) returns path_ignored.
Skipping the dir_add_name call, on the other hand, makes no
difference, because dir_add_name internally does a
cache_name_exists() check itself.
To avoid this behavior change, treat_one_path is instructed to skip
the optimization when check_only or contents is used.
Finally, some numbers (best of 20 runs) that show why it's worth all
the hassle:
git status | webkit linux-2.6 libreoffice-core gentoo-x86
-------------+----------------------------------------------
before | 1.097s 0.208s 0.399s 0.539s
after | 0.736s 0.159s 0.248s 0.501s
nr. patterns | 89 376 19 0
nr. tracked | 182k 40k 63k 101k
treat_leading_path: 0.000 0.000
read_directory: 2.879 1.299
+treat_one_path: 1.620 0.599
++is_excluded: 1.416 0.103
+++prep_exclude: 0.198 0.040
+++matching: 0.904 0.036
++dir_exist: 0.035 0.036
++index_name_exists: 0.289 0.291
lazy_init_name_hash: 0.257 0.257
+simplify_away: 0.085 0.082
+dir_add_name: 0.446 0.000
Tracked-down-by: Karsten Blees <karsten.blees@gmail.com>
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
dir.c | 80 ++++++++++++++++++++++++++++++++++++++++++++-----------------------
1 file changed, 53 insertions(+), 27 deletions(-)
diff --git a/dir.c b/dir.c
index c57bf06..6809dd2 100644
--- a/dir.c
+++ b/dir.c
@@ -43,8 +43,11 @@ struct path_simplify {
const char *path;
};
-static int read_directory_recursive(struct dir_struct *dir, const char *path, int len,
- int check_only, const struct path_simplify *simplify);
+static void read_directory_recursive(struct dir_struct *dir,
+ const char *path, int len,
+ int check_only,
+ const struct path_simplify *simplify,
+ int *contents);
static int get_dtype(struct dirent *de, const char *path, int len);
static inline int memequal_icase(const char *a, const char *b, int n)
@@ -1184,7 +1187,7 @@ static enum directory_treatment treat_directory(struct dir_struct *dir,
const char *dirname, int len, int exclude,
const struct path_simplify *simplify)
{
- int ret;
+ int contents = 0, ret;
START_CLOCK();
/* The "len-1" is to strip the final '/' */
ret = directory_exists_in_index(dirname, len-1);
@@ -1219,19 +1222,19 @@ static enum directory_treatment treat_directory(struct dir_struct *dir,
* check if it contains only ignored files
*/
if ((dir->flags & DIR_SHOW_IGNORED) && !exclude) {
- int ignored;
dir->flags &= ~DIR_SHOW_IGNORED;
dir->flags |= DIR_HIDE_EMPTY_DIRECTORIES;
- ignored = read_directory_recursive(dir, dirname, len, 1, simplify);
+ read_directory_recursive(dir, dirname, len, 1, simplify, &contents);
dir->flags &= ~DIR_HIDE_EMPTY_DIRECTORIES;
dir->flags |= DIR_SHOW_IGNORED;
- return ignored ? ignore_directory : show_directory;
+ return contents ? ignore_directory : show_directory;
}
if (!(dir->flags & DIR_SHOW_IGNORED) &&
!(dir->flags & DIR_HIDE_EMPTY_DIRECTORIES))
return show_directory;
- if (!read_directory_recursive(dir, dirname, len, 1, simplify))
+ read_directory_recursive(dir, dirname, len, 1, simplify, &contents);
+ if (!contents)
return ignore_directory;
return show_directory;
}
@@ -1398,10 +1401,26 @@ enum path_treatment {
static enum path_treatment treat_one_path(struct dir_struct *dir,
struct strbuf *path,
const struct path_simplify *simplify,
- int dtype, struct dirent *de)
+ int dtype, struct dirent *de,
+ int exclude_shortcut_ok)
{
int exclude;
+ if (dtype == DT_UNKNOWN)
+ dtype = get_dtype(de, path->buf, path->len);
+
+ if (exclude_shortcut_ok &&
+ !(dir->flags & DIR_SHOW_IGNORED) &&
+ !(dir->flags & DIR_COLLECT_IGNORED) &&
+ dtype != DT_DIR) {
+ struct cache_entry *ce;
+ START_CLOCK();
+ ce = cache_name_exists(path->buf, path->len, ignore_case);
+ STOP_CLOCK(tv_index_name_exists);
+ if (ce)
+ return path_ignored;
+ }
+
START_CLOCK();
exclude = is_excluded(dir, path->buf, path->len, &dtype);
STOP_CLOCK(tv_is_excluded);
@@ -1417,9 +1436,6 @@ static enum path_treatment treat_one_path(struct dir_struct *dir,
if (exclude && !(dir->flags & DIR_SHOW_IGNORED))
return path_ignored;
- if (dtype == DT_UNKNOWN)
- dtype = get_dtype(de, path->buf, path->len);
-
switch (dtype) {
default:
return path_ignored;
@@ -1451,7 +1467,8 @@ static enum path_treatment treat_path(struct dir_struct *dir,
struct dirent *de,
struct strbuf *path,
int baselen,
- const struct path_simplify *simplify)
+ const struct path_simplify *simplify,
+ int exclude_shortcut_ok)
{
int dtype, ret;
@@ -1467,7 +1484,7 @@ static enum path_treatment treat_path(struct dir_struct *dir,
dtype = DTYPE(de);
START_CLOCK();
- ret = treat_one_path(dir, path, simplify, dtype, de);
+ ret = treat_one_path(dir, path, simplify, dtype, de, exclude_shortcut_ok);
STOP_CLOCK(tv_treat_one_path);
return ret;
}
@@ -1481,13 +1498,13 @@ static enum path_treatment treat_path(struct dir_struct *dir,
* Also, we ignore the name ".git" (even if it is not a directory).
* That likely will not change.
*/
-static int read_directory_recursive(struct dir_struct *dir,
- const char *base, int baselen,
- int check_only,
- const struct path_simplify *simplify)
+static void read_directory_recursive(struct dir_struct *dir,
+ const char *base, int baselen,
+ int check_only,
+ const struct path_simplify *simplify,
+ int *contents)
{
DIR *fdir;
- int contents = 0;
struct dirent *de;
struct strbuf path = STRBUF_INIT;
@@ -1499,18 +1516,29 @@ static int read_directory_recursive(struct dir_struct *dir,
dir->exclude_prepared = 0;
while ((de = readdir(fdir)) != NULL) {
- switch (treat_path(dir, de, &path, baselen, simplify)) {
+ switch (treat_path(dir, de, &path, baselen,
+ simplify,
+ !check_only && !contents)) {
case path_recurse:
- contents += read_directory_recursive(dir, path.buf,
- path.len, 0,
- simplify);
+ read_directory_recursive(dir, path.buf,
+ path.len, 0,
+ simplify,
+ contents);
continue;
case path_ignored:
continue;
case path_handled:
break;
}
- contents++;
+ /*
+ * Update the last argument to treat_path if anything
+ * else is done after this point. This is because if
+ * treat_path's exclude_shortcut_ok is true, it may
+ * incorrectly return path_ignored (and never reaches
+ * this part) instead of path_handled.
+ */
+ if (contents)
+ (*contents)++;
if (check_only)
break;
START_CLOCK();
@@ -1521,8 +1549,6 @@ static int read_directory_recursive(struct dir_struct *dir,
out:
dir->exclude_prepared = 0;
strbuf_release(&path);
-
- return contents;
}
static int cmp_name(const void *p1, const void *p2)
@@ -1593,7 +1619,7 @@ static int treat_leading_path(struct dir_struct *dir,
break;
dir->exclude_prepared = 0;
if (treat_one_path(dir, &sb, simplify,
- DT_DIR, NULL) == path_ignored)
+ DT_DIR, NULL, 0) == path_ignored)
break; /* do not recurse into it */
if (len <= baselen) {
rc = 1;
@@ -1621,7 +1647,7 @@ int read_directory(struct dir_struct *dir, const char *path, int len, const char
STOP_CLOCK(tv_lazy_init_name_hash);
#endif
START_CLOCK();
- read_directory_recursive(dir, path, len, 0, simplify);
+ read_directory_recursive(dir, path, len, 0, simplify, NULL);
STOP_CLOCK(tv_read_directory);
}
#ifdef MEASURE_EXCLUDE
--
1.8.1.2.536.gf441e6d
next prev parent reply other threads:[~2013-03-12 13:07 UTC|newest]
Thread overview: 48+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-03-09 4:09 [PATCH 0/3] Trivial (and small) exclude optimizations Nguyễn Thái Ngọc Duy
2013-03-09 4:09 ` [PATCH 1/3] match_pathname: avoid calling strncmp if baselen is 0 Nguyễn Thái Ngọc Duy
2013-03-09 9:06 ` Antoine Pelisse
2013-03-09 4:09 ` [PATCH 2/3] dir.c: inline convenient *_icase helpers Nguyễn Thái Ngọc Duy
2013-03-09 4:09 ` [PATCH 3/3] match_basename: use strncmp instead of strcmp Nguyễn Thái Ngọc Duy
2013-03-09 7:50 ` Junio C Hamano
2013-03-09 8:47 ` Fredrik Gustafsson
2013-03-09 9:58 ` Duy Nguyen
2013-03-10 6:14 ` [PATCH v2 0/6] Exclude optimizations Nguyễn Thái Ngọc Duy
2013-03-10 6:14 ` [PATCH v2 1/6] match_pathname: avoid calling strncmp if baselen is 0 Nguyễn Thái Ngọc Duy
2013-03-10 6:14 ` [PATCH v2 2/6] dir.c: inline convenient *_icase helpers Nguyễn Thái Ngọc Duy
2013-03-10 6:14 ` [PATCH v2 3/6] match_basename: use strncmp instead of strcmp Nguyễn Thái Ngọc Duy
2013-03-10 7:34 ` Junio C Hamano
2013-03-10 10:38 ` Duy Nguyen
2013-03-10 11:43 ` Antoine Pelisse
2013-03-10 11:54 ` Antoine Pelisse
2013-03-10 12:06 ` Duy Nguyen
2013-03-10 12:11 ` Antoine Pelisse
2013-03-10 12:14 ` Duy Nguyen
2013-03-12 20:59 ` Junio C Hamano
2013-03-13 1:11 ` Duy Nguyen
2013-03-10 6:14 ` [PATCH v2 4/6] match_{base,path}name: replace strncmp_icase with strnequal_icase Nguyễn Thái Ngọc Duy
2013-03-10 6:14 ` [PATCH v2 5/6] dir.c: pass pathname length to last_exclude_matching Nguyễn Thái Ngọc Duy
2013-03-10 6:14 ` [PATCH v2 6/6] exclude: filter patterns by directory level Nguyễn Thái Ngọc Duy
2013-03-10 8:20 ` Junio C Hamano
2013-03-10 10:18 ` Duy Nguyen
2013-03-10 10:58 ` Junio C Hamano
2013-03-10 11:14 ` Duy Nguyen
2013-03-11 15:11 ` [PATCH v2 0/6] Exclude optimizations Duy Nguyen
2013-03-12 13:04 ` [PATCH v3 00/13] " Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` [PATCH v3 01/13] dir.c: add MEASURE_EXCLUDE code for tracking exclude performance Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` [PATCH v3 02/13] match_pathname: avoid calling strncmp if baselen is 0 Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` [PATCH v3 03/13] dir.c: inline convenient *_icase helpers Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` [PATCH v3 04/13] match_basename: use strncmp instead of strcmp Nguyễn Thái Ngọc Duy
2013-03-12 17:40 ` Antoine Pelisse
2013-03-13 1:05 ` Duy Nguyen
2013-03-12 13:04 ` [PATCH v3 05/13] match_{base,path}name: replace strncmp_icase with memequal_icase Nguyễn Thái Ngọc Duy
2013-03-13 1:14 ` Duy Nguyen
2013-03-12 13:04 ` [PATCH v3 06/13] dir: pass pathname length to last_exclude_matching Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` [PATCH v3 07/13] exclude: avoid calling prep_exclude on entries of the same directory Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` [PATCH v3 08/13] exclude: record baselen in the pattern Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` [PATCH v3 09/13] exclude: filter out patterns not applicable to the current directory Nguyễn Thái Ngọc Duy
2013-03-12 23:13 ` Eric Sunshine
2013-03-12 13:04 ` Nguyễn Thái Ngọc Duy [this message]
2013-03-12 13:04 ` [PATCH v3 11/13] Preallocate hash tables when the number of inserts are known in advance Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` [PATCH v3 12/13] name-hash: allow to lookup a name with precalculated base hash Nguyễn Thái Ngọc Duy
2013-03-12 13:05 ` [PATCH v3 13/13] read_directory: calculate name hashes incrementally Nguyễn Thái Ngọc Duy
2013-03-14 13:05 ` [PATCH v3 00/13] Exclude optimizations Duy Nguyen
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1363093500-16796-11-git-send-email-pclouds@gmail.com \
--to=pclouds@gmail.com \
--cc=git@vger.kernel.org \
--cc=gitster@pobox.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).