All of lore.kernel.org
 help / color / mirror / Atom feed
From: Ben Peart <Ben.Peart@microsoft.com>
To: "git@vger.kernel.org" <git@vger.kernel.org>,
	"gitster@pobox.com" <gitster@pobox.com>
Cc: "pclouds@gmail.com" <pclouds@gmail.com>,
	"alexmv@dropbox.com" <alexmv@dropbox.com>,
	"blees@dcon.de" <blees@dcon.de>,
	"bmwill@google.com" <bmwill@google.com>,
	"avarab@gmail.com" <avarab@gmail.com>,
	"johannes.schindelin@gmx.de" <johannes.schindelin@gmx.de>,
	"martin.agren@gmail.com" <martin.agren@gmail.com>,
	Ben Peart <Ben.Peart@microsoft.com>
Subject: [PATCH v3 1/2] fsexcludes: add a programmatic way to exclude files from git's working directory traversal logic
Date: Fri, 13 Apr 2018 12:22:52 +0000	[thread overview]
Message-ID: <20180413122218.1756-2-benpeart@microsoft.com> (raw)
In-Reply-To: <20180413122218.1756-1-benpeart@microsoft.com>

The File System Excludes module is a new programmatic way to exclude files and
folders from git's traversal of the working directory.  fsexcludes_init() should
be called with a string buffer that contains a NUL separated list of path names
of the files and/or directories that should be included.  Any path not listed
will be excluded. The paths should be relative to the root of the working
directory and be separated by a single NUL.

The excludes logic in dir.c has been updated to honor the results of
fsexcludes_is_excluded_from().  If fsexcludes does not exclude the file, the
normal excludes logic is also checked as it could further reduce the set of
files that should be included.

Signed-off-by: Ben Peart <benpeart@microsoft.com>
---
 Makefile     |   1 +
 dir.c        |  24 +++++-
 fsexcludes.c | 211 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fsexcludes.h |  29 +++++++
 4 files changed, 263 insertions(+), 2 deletions(-)
 create mode 100644 fsexcludes.c
 create mode 100644 fsexcludes.h

diff --git a/Makefile b/Makefile
index f181687250..a4f1471272 100644
--- a/Makefile
+++ b/Makefile
@@ -822,6 +822,7 @@ LIB_OBJS += exec_cmd.o
 LIB_OBJS += fetch-object.o
 LIB_OBJS += fetch-pack.o
 LIB_OBJS += fsck.o
+LIB_OBJS += fsexcludes.o
 LIB_OBJS += fsmonitor.o
 LIB_OBJS += gettext.o
 LIB_OBJS += gpg-interface.o
diff --git a/dir.c b/dir.c
index 63a917be45..47a073efe1 100644
--- a/dir.c
+++ b/dir.c
@@ -18,6 +18,7 @@
 #include "utf8.h"
 #include "varint.h"
 #include "ewah/ewok.h"
+#include "fsexcludes.h"
 #include "fsmonitor.h"
 
 /*
@@ -1102,6 +1103,12 @@ int is_excluded_from_list(const char *pathname,
 			  struct exclude_list *el, struct index_state *istate)
 {
 	struct exclude *exclude;
+
+	if (*dtype == DT_UNKNOWN)
+		*dtype = get_dtype(NULL, istate, pathname, pathlen);
+	if (fsexcludes_is_excluded_from(istate, pathname, pathlen, *dtype) > 0)
+		return 1;
+
 	exclude = last_exclude_matching_from_list(pathname, pathlen, basename,
 						  dtype, el, istate);
 	if (exclude)
@@ -1317,8 +1324,15 @@ struct exclude *last_exclude_matching(struct dir_struct *dir,
 int is_excluded(struct dir_struct *dir, struct index_state *istate,
 		const char *pathname, int *dtype_p)
 {
-	struct exclude *exclude =
-		last_exclude_matching(dir, istate, pathname, dtype_p);
+	struct exclude *exclude;
+	int pathlen = strlen(pathname);
+
+	if (*dtype_p == DT_UNKNOWN)
+		*dtype_p = get_dtype(NULL, istate, pathname, pathlen);
+	if (fsexcludes_is_excluded_from(istate, pathname, pathlen, *dtype_p) > 0)
+		return 1;
+
+	exclude = last_exclude_matching(dir, istate, pathname, dtype_p);
 	if (exclude)
 		return exclude->flags & EXC_FLAG_NEGATIVE ? 0 : 1;
 	return 0;
@@ -1671,6 +1685,9 @@ static enum path_treatment treat_one_path(struct dir_struct *dir,
 	if (dtype != DT_DIR && has_path_in_index)
 		return path_none;
 
+	if (fsexcludes_is_excluded_from(istate, path->buf, path->len, dtype) > 0)
+		return path_excluded;
+
 	/*
 	 * When we are looking at a directory P in the working tree,
 	 * there are three cases:
@@ -2011,6 +2028,9 @@ static enum path_treatment read_directory_recursive(struct dir_struct *dir,
 		/* add the path to the appropriate result list */
 		switch (state) {
 		case path_excluded:
+			if (fsexcludes_is_excluded_from(istate, path.buf, path.len,
+					get_dtype(cdir.de, istate, path.buf, path.len)) > 0)
+				break;
 			if (dir->flags & DIR_SHOW_IGNORED)
 				dir_add_name(dir, istate, path.buf, path.len);
 			else if ((dir->flags & DIR_SHOW_IGNORED_TOO) ||
diff --git a/fsexcludes.c b/fsexcludes.c
new file mode 100644
index 0000000000..0ef57f107b
--- /dev/null
+++ b/fsexcludes.c
@@ -0,0 +1,211 @@
+#include "cache.h"
+#include "fsexcludes.h"
+#include "hashmap.h"
+#include "strbuf.h"
+
+static int fsexcludes_initialized = 0;
+static struct strbuf fsexcludes_data = STRBUF_INIT;
+static struct hashmap fsexcludes_hashmap;
+static struct hashmap parent_directory_hashmap;
+
+struct fsexcludes {
+	struct hashmap_entry ent; /* must be the first member! */
+	const char *pattern; /* stored by reference; only patternlen bytes form the key */
+	int patternlen; /* number of bytes of pattern that are hashed and compared */
+};
+
+static unsigned int(*fsexcludeshash)(const void *buf, size_t len);
+static int(*fsexcludescmp)(const char *a, const char *b, size_t len);
+
+/* hashmap_cmp_fn: return 0 (equal) only if both length and bytes match. */
+static int fsexcludes_hashmap_cmp(const void *unused_cmp_data,
+	const void *a, const void *b, const void *key)
+{
+	const struct fsexcludes *fse1 = a, *fse2 = b;
+	return fse1->patternlen != fse2->patternlen || /* a prefix is not a match */
+		fsexcludescmp(fse1->pattern, fse2->pattern, fse1->patternlen);
+}
+
+static int check_fsexcludes_hashmap(struct hashmap *map, const char *pattern, int patternlen)
+{
+	struct strbuf sb = STRBUF_INIT;
+	struct fsexcludes fse;
+	char *slash;
+
+	/* Returns 0 if listed in map, else 1.  First try the path exactly as given. */
+	strbuf_add(&sb, pattern, patternlen); /* private copy that strchr() below can scan */
+	fse.pattern = sb.buf;
+	fse.patternlen = sb.len;
+	hashmap_entry_init(&fse, fsexcludeshash(fse.pattern, fse.patternlen));
+	if (hashmap_get(map, &fse, NULL)) {
+		strbuf_release(&sb);
+		return 0;
+	}
+
+	/*
+	 * Then try each leading directory prefix, trailing slash
+	 * included: 'a/b/foo.txt' is looked up as 'a/' and 'a/b/'.
+	 * '/' is only tried when the path itself starts with a slash.
+	 */
+	slash = strchr(sb.buf, '/');
+	while (slash) {
+		fse.pattern = sb.buf;
+		fse.patternlen = slash - sb.buf + 1;
+		hashmap_entry_init(&fse, fsexcludeshash(fse.pattern, fse.patternlen));
+		if (hashmap_get(map, &fse, NULL)) {
+			strbuf_release(&sb);
+			return 0;
+		}
+		slash = strchr(slash + 1, '/');
+	}
+
+	strbuf_release(&sb);
+	return 1; /* neither the path nor any leading directory is in map */
+}
+
+static void fsexcludes_hashmap_add(struct hashmap *map, const char *pattern, const int patternlen)
+{
+	struct fsexcludes *fse; /* new entry for pattern; added without a duplicate check */
+
+	fse = xmalloc(sizeof(struct fsexcludes));
+	fse->pattern = pattern; /* the pointer is stored, the bytes are not copied */
+	fse->patternlen = patternlen;
+	hashmap_entry_init(fse, fsexcludeshash(fse->pattern, fse->patternlen));
+	hashmap_add(map, fse);
+}
+
+static void initialize_fsexcludes_hashmap(struct hashmap *map, struct strbuf *fsexcludes_data)
+{
+	char *buf, *entry;
+	size_t len;
+	size_t i; /* same type as len, so "i < len" is not a signed/unsigned compare */
+
+	/*
+	 * Build a hashmap with one entry per NUL-terminated path in
+	 * fsexcludes_data; bytes after the last NUL are not added.
+	 */
+	fsexcludeshash = ignore_case ? memihash : memhash;
+	fsexcludescmp = ignore_case ? strncasecmp : strncmp;
+	hashmap_init(map, fsexcludes_hashmap_cmp, NULL, 0);
+
+	entry = buf = fsexcludes_data->buf;
+	len = fsexcludes_data->len;
+	for (i = 0; i < len; i++) {
+		if (buf[i] == '\0') {
+			fsexcludes_hashmap_add(map, entry, buf + i - entry);
+			entry = buf + i + 1;
+		}
+	}
+}
+
+static void parent_directory_hashmap_add(struct hashmap *map, const char *pattern, const int patternlen)
+{
+	char *slash;
+	struct fsexcludes *fse;
+
+	/*
+	 * Add each leading directory of pattern, trailing slash included
+	 * ('a/' and 'a/b/' for 'a/b/c'), since the excludes logic must also
+	 * match those directories.  Prefixes already in the map are skipped.
+	 */
+	if (patternlen > 1) {
+		slash = strchr(pattern + 1, '/'); /* the search starts at the second byte */
+		while (slash) {
+			fse = xmalloc(sizeof(struct fsexcludes));
+			fse->pattern = pattern;
+			fse->patternlen = slash - pattern + 1;
+			hashmap_entry_init(fse, fsexcludeshash(fse->pattern, fse->patternlen));
+			if (hashmap_get(map, fse, NULL))
+				free(fse); /* this prefix is already in the map */
+			else
+				hashmap_add(map, fse);
+			slash = strchr(slash + 1, '/');
+		}
+	}
+}
+
+static void initialize_parent_directory_hashmap(struct hashmap *map, struct strbuf *vfs_data)
+{
+	char *buf, *entry;
+	size_t len;
+	size_t i; /* same type as len, so "i < len" is not a signed/unsigned compare */
+
+	/*
+	 * Build a hashmap of the parent directories of every NUL-terminated
+	 * path in vfs_data (the fsexcludes data) for quick directory lookups.
+	 */
+	fsexcludeshash = ignore_case ? memihash : memhash;
+	fsexcludescmp = ignore_case ? strncasecmp : strncmp;
+	hashmap_init(map, fsexcludes_hashmap_cmp, NULL, 0);
+
+	entry = buf = vfs_data->buf;
+	len = vfs_data->len;
+	for (i = 0; i < len; i++) {
+		if (buf[i] == '\0') {
+			parent_directory_hashmap_add(map, entry, buf + i - entry);
+			entry = buf + i + 1;
+		}
+	}
+}
+
+static int check_directory_hashmap(struct hashmap *map, const char *pathname, int pathlen)
+{
+	struct strbuf sb = STRBUF_INIT;
+	struct fsexcludes fse;
+
+	/* Look up pathname with a '/' appended; return 0 if found, else 1. */
+	strbuf_add(&sb, pathname, pathlen);
+	strbuf_addch(&sb, '/');
+	fse.pattern = sb.buf;
+	fse.patternlen = sb.len;
+	hashmap_entry_init(&fse, fsexcludeshash(fse.pattern, fse.patternlen));
+	if (hashmap_get(map, &fse, NULL)) {
+		strbuf_release(&sb);
+		return 0;
+	}
+
+	strbuf_release(&sb);
+	return 1; /* "<pathname>/" is not in map */
+}
+
+/*
+ * Return 1 for exclude, 0 for include and -1 for undecided.
+ */
+int fsexcludes_is_excluded_from(struct index_state *istate,
+	const char *pathname, int pathlen, int dtype)
+{
+	if (!fsexcludes_initialized)
+		return -1; /* fsexcludes_init() has not been called (or was freed) */
+
+	if (dtype == DT_REG) {
+		/* lazy init; hashmap_init() gets NULL cmpfn_data, so test tablesize */
+		if (!fsexcludes_hashmap.tablesize)
+			initialize_fsexcludes_hashmap(&fsexcludes_hashmap, &fsexcludes_data);
+
+		return check_fsexcludes_hashmap(&fsexcludes_hashmap, pathname, pathlen);
+	}
+
+	if (dtype == DT_DIR || dtype == DT_LNK) {
+		/* lazy init; hashmap_init() gets NULL cmpfn_data, so test tablesize */
+		if (!parent_directory_hashmap.tablesize)
+			initialize_parent_directory_hashmap(&parent_directory_hashmap, &fsexcludes_data);
+
+		return check_directory_hashmap(&parent_directory_hashmap, pathname, pathlen);
+	}
+
+	return -1; /* any other dtype: no decision is made here */
+}
+
+void fsexcludes_init(struct strbuf *sb)
+{
+	fsexcludes_initialized = 1; /* lets fsexcludes_is_excluded_from() make decisions */
+	fsexcludes_data = *sb; /* take over sb's buffer; nothing is copied */
+	strbuf_detach(sb, NULL); /* reset sb; the copy above keeps the buffer */
+}
+
+void fsexcludes_free(void) {
+	strbuf_release(&fsexcludes_data); /* the path list that map entries point into */
+	hashmap_free(&fsexcludes_hashmap, 1); /* 1: free the entries as well */
+	hashmap_free(&parent_directory_hashmap, 1); /* 1: free the entries as well */
+	fsexcludes_initialized = 0; /* fsexcludes_is_excluded_from() returns -1 again */
+}
diff --git a/fsexcludes.h b/fsexcludes.h
new file mode 100644
index 0000000000..10246daa02
--- /dev/null
+++ b/fsexcludes.h
@@ -0,0 +1,29 @@
+#ifndef FSEXCLUDES_H
+#define FSEXCLUDES_H
+
+/*
+ * The file system excludes functions provide a way to programmatically limit
+ * where git will scan for untracked files.  This is used to speed up the
+ * scan by avoiding scanning parts of the work directory that do not have
+ * any new files.
+ */
+
+/*
+ * sb should contain a NUL separated list of path names of the files
+ * and/or directories that should be checked.  Any path not listed will
+ * be excluded from the scan.
+ *
+ * NOTE: fsexcludes_init() will take ownership of the storage passed in
+ * sb and will reset sb to `STRBUF_INIT`
+ */
+void fsexcludes_init(struct strbuf *sb);
+void fsexcludes_free(void);
+
+/*
+ * Return 1 for exclude, 0 for include and -1 for undecided.
+ */
+int fsexcludes_is_excluded_from(struct index_state *istate,
+	const char *pathname, int pathlen, int dtype); /* dtype is a DT_* value, not a pointer */
+
+
+#endif
-- 
2.17.0.windows.1


  reply	other threads:[~2018-04-13 12:22 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-04-10 21:04 [PATCH v1 0/2] fsexcludes: Add programmatic way to exclude files Ben Peart
2018-04-10 21:04 ` [PATCH v1 1/2] fsexcludes: add a programmatic way to exclude files from git's working directory traversal logic Ben Peart
2018-04-10 22:09   ` Martin Ågren
2018-04-11 19:56     ` Ben Peart
2018-04-11  6:58   ` Junio C Hamano
2018-04-10 21:04 ` [PATCH v1 2/2] fsmonitor: switch to use new fsexcludes logic and remove unused untracked cache based logic Ben Peart
2018-04-11 20:01 ` [PATCH v2 0/2] fsexcludes: Add programmatic way to exclude files Ben Peart
2018-04-11 20:01   ` [PATCH v2 1/2] fsexcludes: add a programmatic way to exclude files from git's working directory traversal logic Ben Peart
2018-04-11 23:52     ` Junio C Hamano
2018-04-13 11:53       ` Ben Peart
2018-04-11 20:01   ` [PATCH v2 2/2] fsmonitor: switch to use new fsexcludes logic and remove unused untracked cache based logic Ben Peart
2018-04-13 12:22 ` [PATCH v3 0/2] fsexcludes: Add programmatic way to exclude files Ben Peart
2018-04-13 12:22   ` Ben Peart [this message]
2018-04-13 12:22   ` [PATCH v3 2/2] fsmonitor: switch to use new fsexcludes logic and remove unused untracked cache based logic Ben Peart
2018-04-18 15:31   ` [PATCH v3 0/2] fsexcludes: Add programmatic way to exclude files Ben Peart
2018-04-18 21:25     ` Junio C Hamano
2018-04-14 15:59 ` [PATCH v1 " Duy Nguyen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180413122218.1756-2-benpeart@microsoft.com \
    --to=ben.peart@microsoft.com \
    --cc=alexmv@dropbox.com \
    --cc=avarab@gmail.com \
    --cc=blees@dcon.de \
    --cc=bmwill@google.com \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=johannes.schindelin@gmx.de \
    --cc=martin.agren@gmail.com \
    --cc=pclouds@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.