From: "Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
To: git@vger.kernel.org
Cc: "Nguyễn Thái Ngọc Duy" <pclouds@gmail.com>
Subject: [PATCH v3 12/13] name-hash: allow to lookup a name with precalculated base hash
Date: Tue, 12 Mar 2013 20:04:59 +0700 [thread overview]
Message-ID: <1363093500-16796-13-git-send-email-pclouds@gmail.com> (raw)
In-Reply-To: <1363093500-16796-1-git-send-email-pclouds@gmail.com>
index_name_exists_base() differs from index_name_exists() in how the
hash is calculated. It uses the base hash as the seed and skips the
baselen part.
The intention is to reduce hashing cost during directory traversal,
where the hash of the directory is calculated once, and used as base
hash for all entries inside.
This poses a constraint in hash function. The function has not changed
since 2008. With luck it'll never change again. If resuming hashing at
any characters are not feasible with a new (better) hash function,
maybe we could stop at directory boundary.
Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@gmail.com>
---
Makefile | 1 +
cache.h | 2 --
dir.c | 1 +
name-hash.c | 48 ++++++++++++++++++++++--------------------------
name-hash.h (new) | 45 +++++++++++++++++++++++++++++++++++++++++++++
read-cache.c | 1 +
unpack-trees.c | 1 +
7 files changed, 71 insertions(+), 28 deletions(-)
create mode 100644 name-hash.h
diff --git a/Makefile b/Makefile
index 26d3332..8d753af 100644
--- a/Makefile
+++ b/Makefile
@@ -690,6 +690,7 @@ LIB_H += mailmap.h
LIB_H += merge-blobs.h
LIB_H += merge-recursive.h
LIB_H += mergesort.h
+LIB_H += name-hash.h
LIB_H += notes-cache.h
LIB_H += notes-merge.h
LIB_H += notes.h
diff --git a/cache.h b/cache.h
index e493563..cf33c67 100644
--- a/cache.h
+++ b/cache.h
@@ -313,7 +313,6 @@ static inline void remove_name_hash(struct cache_entry *ce)
#define refresh_cache(flags) refresh_index(&the_index, (flags), NULL, NULL, NULL)
#define ce_match_stat(ce, st, options) ie_match_stat(&the_index, (ce), (st), (options))
#define ce_modified(ce, st, options) ie_modified(&the_index, (ce), (st), (options))
-#define cache_name_exists(name, namelen, igncase) index_name_exists(&the_index, (name), (namelen), (igncase))
#define cache_name_is_other(name, namelen) index_name_is_other(&the_index, (name), (namelen))
#define resolve_undo_clear() resolve_undo_clear_index(&the_index)
#define unmerge_cache_entry_at(at) unmerge_index_entry_at(&the_index, at)
@@ -442,7 +441,6 @@ extern int write_index(struct index_state *, int newfd);
extern int discard_index(struct index_state *);
extern int unmerged_index(const struct index_state *);
extern int verify_path(const char *path);
-extern struct cache_entry *index_name_exists(struct index_state *istate, const char *name, int namelen, int igncase);
extern int index_name_pos(const struct index_state *, const char *name, int namelen);
#define ADD_CACHE_OK_TO_ADD 1 /* Ok to add */
#define ADD_CACHE_OK_TO_REPLACE 2 /* Ok to replace file/directory */
diff --git a/dir.c b/dir.c
index 6809dd2..5fda5af 100644
--- a/dir.c
+++ b/dir.c
@@ -11,6 +11,7 @@
#include "dir.h"
#include "refs.h"
#include "wildmatch.h"
+#include "name-hash.h"
#ifdef MEASURE_EXCLUDE
static uint32_t tv_treat_leading_path;
diff --git a/name-hash.c b/name-hash.c
index 1287a19..572d232 100644
--- a/name-hash.c
+++ b/name-hash.c
@@ -7,30 +7,7 @@
*/
#define NO_THE_INDEX_COMPATIBILITY_MACROS
#include "cache.h"
-
-/*
- * This removes bit 5 if bit 6 is set.
- *
- * That will make US-ASCII characters hash to their upper-case
- * equivalent. We could easily do this one whole word at a time,
- * but that's for future worries.
- */
-static inline unsigned char icase_hash(unsigned char c)
-{
- return c & ~((c & 0x40) >> 1);
-}
-
-static unsigned int hash_name(const char *name, int namelen)
-{
- unsigned int hash = 0x123;
-
- while (namelen--) {
- unsigned char c = *name++;
- c = icase_hash(c);
- hash = hash*101 + c;
- }
- return hash;
-}
+#include "name-hash.h"
static void hash_index_entry_directories(struct index_state *istate, struct cache_entry *ce)
{
@@ -152,9 +129,11 @@ static int same_name(const struct cache_entry *ce, const char *name, int namelen
return slow_same_name(name, namelen, ce->name, namelen < len ? namelen : len);
}
-struct cache_entry *index_name_exists(struct index_state *istate, const char *name, int namelen, int icase)
+static inline struct cache_entry *find_entry_by_hash(struct index_state *istate,
+ const char *name, int namelen,
+ unsigned int hash,
+ int icase)
{
- unsigned int hash = hash_name(name, namelen);
struct cache_entry *ce;
lazy_init_name_hash(istate);
@@ -189,3 +168,20 @@ struct cache_entry *index_name_exists(struct index_state *istate, const char *na
}
return NULL;
}
+
+struct cache_entry *index_name_exists(struct index_state *istate,
+ const char *name, int namelen,
+ int icase)
+{
+ unsigned int hash = hash_name(name, namelen);
+ return find_entry_by_hash(istate, name, namelen, hash, icase);
+}
+
+struct cache_entry *index_name_exists_base(struct index_state *istate,
+ unsigned int basehash, int baselen,
+ const char *name, int namelen,
+ int icase)
+{
+ unsigned int hash = hash_name_from(basehash, name + baselen, namelen - baselen);
+ return find_entry_by_hash(istate, name, namelen, hash, icase);
+}
diff --git a/name-hash.h b/name-hash.h
new file mode 100644
index 0000000..997de30
--- /dev/null
+++ b/name-hash.h
@@ -0,0 +1,45 @@
+#ifndef NAME_HASH_H
+#define NAME_HASH_H
+
+extern struct cache_entry *index_name_exists(struct index_state *istate,
+ const char *name, int namelen,
+ int igncase);
+extern struct cache_entry *index_name_exists_base(struct index_state *istate,
+ unsigned int basehash, int baselen,
+ const char *name, int namelen,
+ int igncase);
+
+static inline struct cache_entry *cache_name_exists(const char *name, int namelen, int igncase)
+{
+ return index_name_exists(&the_index, name, namelen, igncase);
+}
+
+/*
+ * This removes bit 5 if bit 6 is set.
+ *
+ * That will make US-ASCII characters hash to their upper-case
+ * equivalent. We could easily do this one whole word at a time,
+ * but that's for future worries.
+ */
+static inline unsigned char icase_hash(unsigned char c)
+{
+ return c & ~((c & 0x40) >> 1);
+}
+
+static inline unsigned int hash_name_from(unsigned int hash,
+ const char *name, int namelen)
+{
+ while (namelen--) {
+ unsigned char c = *name++;
+ c = icase_hash(c);
+ hash = hash*101 + c;
+ }
+ return hash;
+}
+
+static inline unsigned int hash_name(const char *name, int namelen)
+{
+ return hash_name_from(0x123, name, namelen);
+}
+
+#endif
diff --git a/read-cache.c b/read-cache.c
index 827ae55..8bd3ec8 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -14,6 +14,7 @@
#include "resolve-undo.h"
#include "strbuf.h"
#include "varint.h"
+#include "name-hash.h"
static struct cache_entry *refresh_cache_entry(struct cache_entry *ce, int really);
diff --git a/unpack-trees.c b/unpack-trees.c
index 09e53df..60fa5d4 100644
--- a/unpack-trees.c
+++ b/unpack-trees.c
@@ -8,6 +8,7 @@
#include "progress.h"
#include "refs.h"
#include "attr.h"
+#include "name-hash.h"
/*
* Error messages expected by scripts out of plumbing commands such as
--
1.8.1.2.536.gf441e6d
next prev parent reply other threads:[~2013-03-12 13:07 UTC|newest]
Thread overview: 48+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-03-09 4:09 [PATCH 0/3] Trivial (and small) exclude optimizations Nguyễn Thái Ngọc Duy
2013-03-09 4:09 ` [PATCH 1/3] match_pathname: avoid calling strncmp if baselen is 0 Nguyễn Thái Ngọc Duy
2013-03-09 9:06 ` Antoine Pelisse
2013-03-09 4:09 ` [PATCH 2/3] dir.c: inline convenient *_icase helpers Nguyễn Thái Ngọc Duy
2013-03-09 4:09 ` [PATCH 3/3] match_basename: use strncmp instead of strcmp Nguyễn Thái Ngọc Duy
2013-03-09 7:50 ` Junio C Hamano
2013-03-09 8:47 ` Fredrik Gustafsson
2013-03-09 9:58 ` Duy Nguyen
2013-03-10 6:14 ` [PATCH v2 0/6] Exclude optimizations Nguyễn Thái Ngọc Duy
2013-03-10 6:14 ` [PATCH v2 1/6] match_pathname: avoid calling strncmp if baselen is 0 Nguyễn Thái Ngọc Duy
2013-03-10 6:14 ` [PATCH v2 2/6] dir.c: inline convenient *_icase helpers Nguyễn Thái Ngọc Duy
2013-03-10 6:14 ` [PATCH v2 3/6] match_basename: use strncmp instead of strcmp Nguyễn Thái Ngọc Duy
2013-03-10 7:34 ` Junio C Hamano
2013-03-10 10:38 ` Duy Nguyen
2013-03-10 11:43 ` Antoine Pelisse
2013-03-10 11:54 ` Antoine Pelisse
2013-03-10 12:06 ` Duy Nguyen
2013-03-10 12:11 ` Antoine Pelisse
2013-03-10 12:14 ` Duy Nguyen
2013-03-12 20:59 ` Junio C Hamano
2013-03-13 1:11 ` Duy Nguyen
2013-03-10 6:14 ` [PATCH v2 4/6] match_{base,path}name: replace strncmp_icase with strnequal_icase Nguyễn Thái Ngọc Duy
2013-03-10 6:14 ` [PATCH v2 5/6] dir.c: pass pathname length to last_exclude_matching Nguyễn Thái Ngọc Duy
2013-03-10 6:14 ` [PATCH v2 6/6] exclude: filter patterns by directory level Nguyễn Thái Ngọc Duy
2013-03-10 8:20 ` Junio C Hamano
2013-03-10 10:18 ` Duy Nguyen
2013-03-10 10:58 ` Junio C Hamano
2013-03-10 11:14 ` Duy Nguyen
2013-03-11 15:11 ` [PATCH v2 0/6] Exclude optimizations Duy Nguyen
2013-03-12 13:04 ` [PATCH v3 00/13] " Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` [PATCH v3 01/13] dir.c: add MEASURE_EXCLUDE code for tracking exclude performance Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` [PATCH v3 02/13] match_pathname: avoid calling strncmp if baselen is 0 Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` [PATCH v3 03/13] dir.c: inline convenient *_icase helpers Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` [PATCH v3 04/13] match_basename: use strncmp instead of strcmp Nguyễn Thái Ngọc Duy
2013-03-12 17:40 ` Antoine Pelisse
2013-03-13 1:05 ` Duy Nguyen
2013-03-12 13:04 ` [PATCH v3 05/13] match_{base,path}name: replace strncmp_icase with memequal_icase Nguyễn Thái Ngọc Duy
2013-03-13 1:14 ` Duy Nguyen
2013-03-12 13:04 ` [PATCH v3 06/13] dir: pass pathname length to last_exclude_matching Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` [PATCH v3 07/13] exclude: avoid calling prep_exclude on entries of the same directory Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` [PATCH v3 08/13] exclude: record baselen in the pattern Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` [PATCH v3 09/13] exclude: filter out patterns not applicable to the current directory Nguyễn Thái Ngọc Duy
2013-03-12 23:13 ` Eric Sunshine
2013-03-12 13:04 ` [PATCH v3 10/13] read_directory: avoid invoking exclude machinery on tracked files Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` [PATCH v3 11/13] Preallocate hash tables when the number of inserts are known in advance Nguyễn Thái Ngọc Duy
2013-03-12 13:04 ` Nguyễn Thái Ngọc Duy [this message]
2013-03-12 13:05 ` [PATCH v3 13/13] read_directory: calculate name hashes incrementally Nguyễn Thái Ngọc Duy
2013-03-14 13:05 ` [PATCH v3 00/13] Exclude optimizations Duy Nguyen
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1363093500-16796-13-git-send-email-pclouds@gmail.com \
--to=pclouds@gmail.com \
--cc=git@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).