From: Elijah Newren <newren@gmail.com>
To: git@vger.kernel.org
Cc: Victoria Dye <vdye@github.com>, Derrick Stolee <stolee@gmail.com>,
Lessley Dennington <lessleydennington@gmail.com>,
Elijah Newren <newren@gmail.com>
Subject: [RFC PATCH 5/5] Accelerate ensure_skip_worktree_means_skip_worktree by caching
Date: Sat, 8 Jan 2022 20:57:32 -0800 [thread overview]
Message-ID: <20220109045732.2497526-6-newren@gmail.com> (raw)
In-Reply-To: <20220109045732.2497526-1-newren@gmail.com>
Rather than lstat()'ing every SKIP_WORKTREE path, take advantage of the
fact that entire directories will often be missing, especially for cone
mode and even more so ever since commit 55dfcf9591 ("sparse-checkout:
clear tracked sparse dirs", 2021-09-08). If we have already determined
that the parent directory of a file (or any other previous ancestor)
does not exist, then we already know the file cannot exist and do not
need to lstat() it separately.
Granted, the cost of ensure_skip_worktree_means_skip_worktree() might
be considered a bit high for non-cone mode since it might now lstat()
every SKIP_WORKTREE path when the index is loaded (an O(N) cost, with
N the number of SKIP_WORKTREE paths), but non-cone mode users already
have to deal with the O(N*M) cost (with N=the number of tracked files
and M=the number of sparsity patterns), so this should be reasonable.
Signed-off-by: Elijah Newren <newren@gmail.com>
---
sparse-index.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 103 insertions(+), 2 deletions(-)
diff --git a/sparse-index.c b/sparse-index.c
index 79d50e444c..608782e255 100644
--- a/sparse-index.c
+++ b/sparse-index.c
@@ -341,18 +341,117 @@ void ensure_correct_sparsity(struct index_state *istate)
ensure_full_index(istate);
}
+struct path_cache_entry { /* one cached "does this path exist?" answer */
+ struct hashmap_entry ent; /* map linkage; hashed with memhash() over the path bytes */
+ const char *path; /* start of the key; may not be NUL-terminated at path_length */
+ int path_length; /* number of bytes of path that make up the key */
+ int is_present; /* nonzero if the path was found to exist, 0 if not */
+};
+
+/*
+ * hashmap comparison callback for struct path_cache_entry.
+ *
+ * Returns 0 when both entries hold the same byte sequence.  Otherwise
+ * returns the difference in length when the lengths differ, or the
+ * memcmp() result when they match.  Neither data argument is consulted.
+ */
+static int path_cache_cmp(const void *unused,
+			  const struct hashmap_entry *entry1,
+			  const struct hashmap_entry *entry2,
+			  const void *also_unused)
+{
+	const struct path_cache_entry *lhs =
+		container_of(entry1, const struct path_cache_entry, ent);
+	const struct path_cache_entry *rhs =
+		container_of(entry2, const struct path_cache_entry, ent);
+	int length_delta = lhs->path_length - rhs->path_length;
+
+	/* Keys carry an explicit length, so settle on length before bytes. */
+	if (length_delta)
+		return length_delta;
+	return memcmp(lhs->path, rhs->path, lhs->path_length);
+}
+
+/*
+ * Look up the first str_length bytes of str in map.
+ *
+ * Returns the entry recorded for that byte sequence, or NULL when there
+ * is none.
+ */
+static struct path_cache_entry *find_path_cache_entry(struct hashmap *map,
+						      const char *str,
+						      int str_length)
+{
+	/* Stack-allocated probe; path_cache_cmp() never reads is_present. */
+	struct path_cache_entry key;
+
+	key.path = str;
+	key.path_length = str_length;
+	hashmap_entry_init(&key.ent, memhash(str, str_length));
+
+	return hashmap_get_entry(map, &key, ent, NULL);
+}
+
+/*
+ * Remember whether a path exists so that later queries can skip lstat().
+ *
+ * path_cache:  map that receives the new entry
+ * pool:        allocator backing both the entry and its key
+ * path:        start of the path; only the first path_length bytes are used
+ * path_length: number of bytes of path that make up the key
+ * found:       nonzero if the path exists, 0 if it does not
+ *
+ * The key is copied into the pool instead of borrowing the caller's
+ * pointer.  The caller passes ce->name and may call ensure_full_index()
+ * and rescan while this cache is still live; a private copy means the
+ * cache never depends on those index entries staying allocated.
+ * NOTE(review): confirm whether ensure_full_index() can release the old
+ * entries; if it never does, the copy is merely defensive.
+ */
+static void record(struct hashmap *path_cache,
+		   struct mem_pool *pool,
+		   const char *path,
+		   int path_length,
+		   int found)
+{
+	struct path_cache_entry *entry;
+
+	entry = mem_pool_alloc(pool, sizeof(*entry));
+	hashmap_entry_init(&entry->ent, memhash(path, path_length));
+	entry->path = mem_pool_strndup(pool, path, path_length);
+	entry->path_length = path_length;
+	entry->is_present = found;
+	hashmap_add(path_cache, &entry->ent);
+}
+
+/*
+ * Report whether the first path_length bytes of path name something that
+ * exists, using path_cache to avoid redundant lstat() calls.
+ *
+ * The parent directory (everything before the last '/') is checked first,
+ * via the cache or, failing that, a recursive call.  If the parent is
+ * missing, the path is recorded as absent without being lstat()ed.
+ * Otherwise the path itself is lstat()ed.  Either way the answer for the
+ * path is recorded in path_cache before returning.
+ *
+ * Returns 1 if the path exists and 0 otherwise; any lstat() failure counts
+ * as "does not exist".
+ */
+static int path_found(struct hashmap *path_cache, struct mem_pool *pool,
+		      const char *path, int path_length)
+{
+	int sep = path_length - 1;
+	int parent_exists = 1;
+	int exists = 0;
+
+	/* Scan back to the last '/'; memrchr is sadly glibc-specific */
+	while (sep > 0 && path[sep] != '/')
+		sep--;
+
+	if (sep > 0) {
+		/* The first sep bytes of path name the parent directory. */
+		struct path_cache_entry *cached =
+			find_path_cache_entry(path_cache, path, sep);
+
+		parent_exists = cached ? cached->is_present
+				       : path_found(path_cache, pool, path, sep);
+	}
+
+	/* A path cannot exist under a missing parent, so only then lstat. */
+	if (parent_exists) {
+		struct stat st;
+		const char *terminated = path;
+
+		/* lstat() needs a NUL-terminated string; copy a prefix if needed */
+		if (path[path_length] != '\0')
+			terminated = mem_pool_strndup(pool, path, path_length);
+		exists = !lstat(terminated, &st);
+	}
+
+	record(path_cache, pool, path, path_length, exists);
+	return exists;
+}
+
void ensure_skip_worktree_means_skip_worktree(struct index_state *istate)
{
+ struct hashmap path_cache = HASHMAP_INIT(path_cache_cmp, NULL);
+ struct mem_pool pool;
+
int i;
+
if (!core_apply_sparse_checkout)
return;
+ mem_pool_init(&pool, 32*1024);
restart:
for (i = 0; i < istate->cache_nr; i++) {
struct cache_entry *ce = istate->cache[i];
- struct stat st;
- if (ce_skip_worktree(ce) && !lstat(ce->name, &st)) {
+ if (ce_skip_worktree(ce) &&
+ path_found(&path_cache, &pool, ce->name, strlen(ce->name))) {
if (S_ISSPARSEDIR(ce->ce_mode)) {
ensure_full_index(istate);
goto restart;
@@ -360,6 +459,8 @@ void ensure_skip_worktree_means_skip_worktree(struct index_state *istate)
ce->ce_flags &= ~CE_SKIP_WORKTREE;
}
}
+ hashmap_clear(&path_cache);
+ mem_pool_discard(&pool, 0);
}
--
2.34.1.442.ge63c19bdd2.dirty
next prev parent reply other threads:[~2022-01-09 4:57 UTC|newest]
Thread overview: 11+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-01-09 4:57 [RFC PATCH 0/5] Remove the present-despite-SKIP_WORKTREE class of bugs Elijah Newren
2022-01-09 4:57 ` [RFC PATCH 1/5] t1011: add testcase demonstrating accidental loss of user modifications Elijah Newren
2022-01-09 4:57 ` [RFC PATCH 2/5] unpack-trees: fix accidental loss of user changes Elijah Newren
2022-01-09 4:57 ` [RFC PATCH 3/5] repo_read_index: ensure SKIP_WORKTREE means skip worktree Elijah Newren
2022-01-10 20:38 ` Victoria Dye
2022-01-11 19:27 ` Elijah Newren
2022-01-11 23:09 ` Victoria Dye
2022-01-09 4:57 ` [RFC PATCH 4/5] Update documentation related to sparsity and the skip-worktree bit Elijah Newren
2022-01-09 4:57 ` Elijah Newren [this message]
2022-01-11 18:30 ` [RFC PATCH 5/5] Accelerate ensure_skip_worktree_means_skip_worktree by caching Victoria Dye
2022-01-11 22:04 ` Elijah Newren
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20220109045732.2497526-6-newren@gmail.com \
--to=newren@gmail.com \
--cc=git@vger.kernel.org \
--cc=lessleydennington@gmail.com \
--cc=stolee@gmail.com \
--cc=vdye@github.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).