[PATCHSET 0/3] xfs_db: create names with colliding hashes

public inbox for linux-xfs@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCHSET 0/3] xfs_db: create names with colliding hashes
@ 2023-06-05 15:36 Darrick J. Wong
  2023-06-05 15:37 ` [PATCH 1/3] xfs_db: hoist name obfuscation code out of metadump.c Darrick J. Wong
                   ` (3 more replies)
  0 siblings, 4 replies; 11+ messages in thread
From: Darrick J. Wong @ 2023-06-05 15:36 UTC (permalink / raw)
  To: djwong, cem; +Cc: linux-xfs

Hi all,

While we're on the topic of directory entry naming, create a couple of
new debugger commands to create directory or xattr names that have the
same name hash.  This enables further testing of that aspect of the
dabtree code, which in turn enables us to perform worst case performance
analysis of the parent pointers code.

If you're going to start using this mess, you probably ought to just
pull from my git trees, which are linked below.

This is an extraordinary way to destroy everything.  Enjoy!
Comments and questions are, as always, welcome.

--D

xfsprogs git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=db-hash-collisions
---
 db/Makefile       |    2 
 db/hash.c         |  418 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 db/metadump.c     |  383 -------------------------------------------------
 db/obfuscate.c    |  389 +++++++++++++++++++++++++++++++++++++++++++++++++
 db/obfuscate.h    |   17 ++
 man/man8/xfs_db.8 |   39 +++++
 6 files changed, 859 insertions(+), 389 deletions(-)
 create mode 100644 db/obfuscate.c
 create mode 100644 db/obfuscate.h

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 1/3] xfs_db: hoist name obfuscation code out of metadump.c
  2023-06-05 15:36 [PATCHSET 0/3] xfs_db: create names with colliding hashes Darrick J. Wong
@ 2023-06-05 15:37 ` Darrick J. Wong
  2023-06-06 11:17   ` Carlos Maiolino
  2023-06-15 16:12   ` [PATCH v2 " Darrick J. Wong
  2023-06-05 15:37 ` [PATCH 2/3] xfs_db: create dirents and xattrs with colliding names Darrick J. Wong
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 11+ messages in thread
From: Darrick J. Wong @ 2023-06-05 15:37 UTC (permalink / raw)
  To: djwong, cem; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

We want to create a debugger command that will create obfuscated names
for directory and xattr names, so hoist the name obfuscation code into a
separate file.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/Makefile    |    2 
 db/metadump.c  |  383 -------------------------------------------------------
 db/obfuscate.c |  389 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 db/obfuscate.h |   17 ++
 4 files changed, 408 insertions(+), 383 deletions(-)
 create mode 100644 db/obfuscate.c
 create mode 100644 db/obfuscate.h


diff --git a/db/Makefile b/db/Makefile
index b2e01174571..2f95f67075d 100644
--- a/db/Makefile
+++ b/db/Makefile
@@ -13,7 +13,7 @@ HFILES = addr.h agf.h agfl.h agi.h attr.h attrshort.h bit.h block.h bmap.h \
 	flist.h fprint.h frag.h freesp.h hash.h help.h init.h inode.h input.h \
 	io.h logformat.h malloc.h metadump.h output.h print.h quit.h sb.h \
 	sig.h strvec.h text.h type.h write.h attrset.h symlink.h fsmap.h \
-	fuzz.h
+	fuzz.h obfuscate.h
 CFILES = $(HFILES:.h=.c) btdump.c btheight.c convert.c info.c namei.c \
 	timelimit.c
 LSRCFILES = xfs_admin.sh xfs_ncheck.sh xfs_metadump.sh
diff --git a/db/metadump.c b/db/metadump.c
index 4f8b3adb163..d9a616a9296 100644
--- a/db/metadump.c
+++ b/db/metadump.c
@@ -19,6 +19,7 @@
 #include "faddr.h"
 #include "field.h"
 #include "dir2.h"
+#include "obfuscate.h"
 
 #define DEFAULT_MAX_EXT_SIZE	XFS_MAX_BMBT_EXTLEN
 
@@ -736,19 +737,6 @@ nametable_add(xfs_dahash_t hash, int namelen, unsigned char *name)
 	return ent;
 }
 
-#define is_invalid_char(c)	((c) == '/' || (c) == '\0')
-#define rol32(x,y)		(((x) << (y)) | ((x) >> (32 - (y))))
-
-static inline unsigned char
-random_filename_char(void)
-{
-	static unsigned char filename_alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-						"abcdefghijklmnopqrstuvwxyz"
-						"0123456789-_";
-
-	return filename_alphabet[random() % (sizeof filename_alphabet - 1)];
-}
-
 #define	ORPHANAGE	"lost+found"
 #define	ORPHANAGE_LEN	(sizeof (ORPHANAGE) - 1)
 
@@ -808,375 +796,6 @@ in_lost_found(
 	return slen == namelen && !memcmp(name, s, namelen);
 }
 
-/*
- * Given a name and its hash value, massage the name in such a way
- * that the result is another name of equal length which shares the
- * same hash value.
- */
-static void
-obfuscate_name(
-	xfs_dahash_t	hash,
-	size_t		name_len,
-	unsigned char	*name,
-	bool		is_dirent)
-{
-	unsigned char	*oldname = NULL;
-	unsigned char	*newp;
-	int		i;
-	xfs_dahash_t	new_hash;
-	unsigned char	*first;
-	unsigned char	high_bit;
-	int		tries = 0;
-	bool		is_ci_name = is_dirent && xfs_has_asciici(mp);
-	int		shift;
-
-	/*
-	 * Our obfuscation algorithm requires at least 5-character
-	 * names, so don't bother if the name is too short.  We
-	 * work backward from a hash value to determine the last
-	 * five bytes in a name required to produce a new name
-	 * with the same hash.
-	 */
-	if (name_len < 5)
-		return;
-
-	if (is_ci_name) {
-		oldname = alloca(name_len);
-		memcpy(oldname, name, name_len);
-	}
-
-again:
-	newp = name;
-	new_hash = 0;
-
-	/*
-	 * If we cannot generate a ci-compatible obfuscated name after 1000
-	 * tries, don't bother obfuscating the name.
-	 */
-	if (tries++ > 1000) {
-		memcpy(name, oldname, name_len);
-		return;
-	}
-
-	/*
-	 * The beginning of the obfuscated name can be pretty much
-	 * anything, so fill it in with random characters.
-	 * Accumulate its new hash value as we go.
-	 */
-	for (i = 0; i < name_len - 5; i++) {
-		*newp = random_filename_char();
-		if (is_ci_name)
-			new_hash = xfs_ascii_ci_xfrm(*newp) ^
-							rol32(new_hash, 7);
-		else
-			new_hash = *newp ^ rol32(new_hash, 7);
-		newp++;
-	}
-
-	/*
-	 * Compute which five bytes need to be used at the end of
-	 * the name so the hash of the obfuscated name is the same
-	 * as the hash of the original.  If any result in an invalid
-	 * character, flip a bit and arrange for a corresponding bit
-	 * in a neighboring byte to be flipped as well.  For the
-	 * last byte, the "neighbor" to change is the first byte
-	 * we're computing here.
-	 */
-	new_hash = rol32(new_hash, 3) ^ hash;
-
-	first = newp;
-	high_bit = 0;
-	for (shift = 28; shift >= 0; shift -= 7) {
-		*newp = (new_hash >> shift & 0x7f) ^ high_bit;
-		if (is_invalid_char(*newp)) {
-			*newp ^= 1;
-			high_bit = 0x80;
-		} else
-			high_bit = 0;
-
-		/*
-		 * If ascii-ci is enabled, uppercase characters are converted
-		 * to lowercase characters while computing the name hash.  If
-		 * any of the necessary correction bytes are uppercase, the
-		 * hash of the new name will not match.  Try again with a
-		 * different prefix.
-		 */
-		if (is_ci_name && xfs_ascii_ci_need_xfrm(*newp))
-			goto again;
-
-		ASSERT(!is_invalid_char(*newp));
-		newp++;
-	}
-
-	/*
-	 * If we flipped a bit on the last byte, we need to fix up
-	 * the matching bit in the first byte.  The result will
-	 * be a valid character, because we know that first byte
-	 * has 0's in its upper four bits (it was produced by a
-	 * 28-bit right-shift of a 32-bit unsigned value).
-	 */
-	if (high_bit) {
-		*first ^= 0x10;
-
-		if (is_ci_name && xfs_ascii_ci_need_xfrm(*first))
-			goto again;
-
-		ASSERT(!is_invalid_char(*first));
-	}
-}
-
-/*
- * Flip a bit in each of two bytes at the end of the given name.
- * This is used in generating a series of alternate names to be used
- * in the event a duplicate is found.
- *
- * The bits flipped are selected such that they both affect the same
- * bit in the name's computed hash value, so flipping them both will
- * preserve the hash.
- *
- * The following diagram aims to show the portion of a computed
- * hash that a given byte of a name affects.
- *
- *	   31    28      24    21	     14		  8 7       3     0
- *	   +-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-+
- * hash:   | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
- *	   +-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-+
- *	  last-4 ->|	       |<-- last-2 --->|	   |<--- last ---->|
- *		 |<-- last-3 --->|	     |<-- last-1 --->|     |<- last-4
- *			 |<-- last-7 --->|	     |<-- last-5 --->|
- *	   |<-- last-8 --->|	       |<-- last-6 --->|
- *			. . . and so on
- *
- * The last byte of the name directly affects the low-order byte of
- * the hash.  The next-to-last affects bits 7-14, the next one back
- * affects bits 14-21, and so on.  The effect wraps around when it
- * goes beyond the top of the hash (as happens for byte last-4).
- *
- * Bits that are flipped together "overlap" on the hash value.  As
- * an example of overlap, the last two bytes both affect bit 7 in
- * the hash.  That pair of bytes (and their overlapping bits) can be
- * used for this "flip bit" operation (it's the first pair tried,
- * actually).
- *
- * A table defines overlapping pairs--the bytes involved and bits
- * within them--that can be used this way.  The byte offset is
- * relative to a starting point within the name, which will be set
- * to affect the bytes at the end of the name.  The function is
- * called with a "bitseq" value which indicates which bit flip is
- * desired, and this translates directly into selecting which entry
- * in the bit_to_flip[] table to apply.
- *
- * The function returns 1 if the operation was successful.  It
- * returns 0 if the result produced a character that's not valid in
- * a name (either '/' or a '\0').  Finally, it returns -1 if the bit
- * sequence number is beyond what is supported for a name of this
- * length.
- *
- * Discussion
- * ----------
- * (Also see the discussion above find_alternate(), below.)
- *
- * In order to make this function work for any length name, the
- * table is ordered by increasing byte offset, so that the earliest
- * entries can apply to the shortest strings.  This way all names
- * are done consistently.
- *
- * When bit flips occur, they can convert printable characters
- * into non-printable ones.  In an effort to reduce the impact of
- * this, the first bit flips are chosen to affect bytes the end of
- * the name (and furthermore, toward the low bits of a byte).  Those
- * bytes are often non-printable anyway because of the way they are
- * initially selected by obfuscate_name()).  This is accomplished,
- * using later table entries first.
- *
- * Each row in the table doubles the number of alternates that
- * can be generated.  A two-byte name is limited to using only
- * the first row, so it's possible to generate two alternates
- * (the original name, plus the alternate produced by flipping
- * the one pair of bits).  In a 5-byte name, the effect of the
- * first byte overlaps the last by 4 its, and there are 8 bits
- * to flip, allowing for 256 possible alternates.
- *
- * Short names (less than 5 bytes) are never even obfuscated, so for
- * such names the relatively small number of alternates should never
- * really be a problem.
- *
- * Long names (more than 6 bytes, say) are not likely to exhaust
- * the number of available alternates.  In fact, the table could
- * probably have stopped at 8 entries, on the assumption that 256
- * alternates should be enough for most any situation.  The entries
- * beyond those are present mostly for demonstration of how it could
- * be populated with more entries, should it ever be necessary to do
- * so.
- */
-static int
-flip_bit(
-	size_t		name_len,
-	unsigned char	*name,
-	uint32_t	bitseq)
-{
-	int	index;
-	size_t	offset;
-	unsigned char *p0, *p1;
-	unsigned char m0, m1;
-	struct {
-	    int		byte;	/* Offset from start within name */
-	    unsigned char bit;	/* Bit within that byte */
-	} bit_to_flip[][2] = {	/* Sorted by second entry's byte */
-	    { { 0, 0 }, { 1, 7 } },	/* Each row defines a pair */
-	    { { 1, 0 }, { 2, 7 } },	/* of bytes and a bit within */
-	    { { 2, 0 }, { 3, 7 } },	/* each byte.  Each bit in */
-	    { { 0, 4 }, { 4, 0 } },	/* a pair affects the same */
-	    { { 0, 5 }, { 4, 1 } },	/* bit in the hash, so flipping */
-	    { { 0, 6 }, { 4, 2 } },	/* both will change the name */
-	    { { 0, 7 }, { 4, 3 } },	/* while preserving the hash. */
-	    { { 3, 0 }, { 4, 7 } },
-	    { { 0, 0 }, { 5, 3 } },	/* The first entry's byte offset */
-	    { { 0, 1 }, { 5, 4 } },	/* must be less than the second. */
-	    { { 0, 2 }, { 5, 5 } },
-	    { { 0, 3 }, { 5, 6 } },	/* The table can be extended to */
-	    { { 0, 4 }, { 5, 7 } },	/* an arbitrary number of entries */
-	    { { 4, 0 }, { 5, 7 } },	/* but there's not much point. */
-		/* . . . */
-	};
-
-	/* Find the first entry *not* usable for name of this length */
-
-	for (index = 0; index < ARRAY_SIZE(bit_to_flip); index++)
-		if (bit_to_flip[index][1].byte >= name_len)
-			break;
-
-	/*
-	 * Back up to the last usable entry.  If that number is
-	 * smaller than the bit sequence number, inform the caller
-	 * that nothing this large (or larger) will work.
-	 */
-	if (bitseq > --index)
-		return -1;
-
-	/*
-	 * We will be switching bits at the end of name, with a
-	 * preference for affecting the last bytes first.  Compute
-	 * where in the name we'll start applying the changes.
-	 */
-	offset = name_len - (bit_to_flip[index][1].byte + 1);
-	index -= bitseq;	/* Use later table entries first */
-
-	p0 = name + offset + bit_to_flip[index][0].byte;
-	p1 = name + offset + bit_to_flip[index][1].byte;
-	m0 = 1 << bit_to_flip[index][0].bit;
-	m1 = 1 << bit_to_flip[index][1].bit;
-
-	/* Only change the bytes if it produces valid characters */
-
-	if (is_invalid_char(*p0 ^ m0) || is_invalid_char(*p1 ^ m1))
-		return 0;
-
-	*p0 ^= m0;
-	*p1 ^= m1;
-
-	return 1;
-}
-
-/*
- * This function generates a well-defined sequence of "alternate"
- * names for a given name.  An alternate is a name having the same
- * length and same hash value as the original name.  This is needed
- * because the algorithm produces only one obfuscated name to use
- * for a given original name, and it's possible that result matches
- * a name already seen.  This function checks for this, and if it
- * occurs, finds another suitable obfuscated name to use.
- *
- * Each bit in the binary representation of the sequence number is
- * used to select one possible "bit flip" operation to perform on
- * the name.  So for example:
- *    seq = 0:	selects no bits to flip
- *    seq = 1:	selects the 0th bit to flip
- *    seq = 2:	selects the 1st bit to flip
- *    seq = 3:	selects the 0th and 1st bit to flip
- *    ... and so on.
- *
- * The flip_bit() function takes care of the details of the bit
- * flipping within the name.  Note that the "1st bit" in this
- * context is a bit sequence number; i.e. it doesn't necessarily
- * mean bit 0x02 will be changed.
- *
- * If a valid name (one that contains no '/' or '\0' characters) is
- * produced by this process for the given sequence number, this
- * function returns 1.  If the result is not valid, it returns 0.
- * Returns -1 if the sequence number is beyond the the maximum for
- * names of the given length.
- *
- *
- * Discussion
- * ----------
- * The number of alternates available for a given name is dependent
- * on its length.  A "bit flip" involves inverting two bits in
- * a name--the two bits being selected such that their values
- * affect the name's hash value in the same way.  Alternates are
- * thus generated by inverting the value of pairs of such
- * "overlapping" bits in the original name.  Each byte after the
- * first in a name adds at least one bit of overlap to work with.
- * (See comments above flip_bit() for more discussion on this.)
- *
- * So the number of alternates is dependent on the number of such
- * overlapping bits in a name.  If there are N bit overlaps, there
- * 2^N alternates for that hash value.
- *
- * Here are the number of overlapping bits available for generating
- * alternates for names of specific lengths:
- *	1	0	(must have 2 bytes to have any overlap)
- *	2	1	One bit overlaps--so 2 possible alternates
- *	3	2	Two bits overlap--so 4 possible alternates
- *	4	4	Three bits overlap, so 2^3 alternates
- *	5	8	8 bits overlap (due to wrapping), 256 alternates
- *	6	18	2^18 alternates
- *	7	28	2^28 alternates
- *	   ...
- * It's clear that the number of alternates grows very quickly with
- * the length of the name.  But note that the set of alternates
- * includes invalid names.  And for certain (contrived) names, the
- * number of valid names is a fairly small fraction of the total
- * number of alternates.
- *
- * The main driver for this infrastructure for coming up with
- * alternate names is really related to names 5 (or possibly 6)
- * bytes in length.  5-byte obfuscated names contain no randomly-
- * generated bytes in them, and the chance of an obfuscated name
- * matching an already-seen name is too high to just ignore.  This
- * methodical selection of alternates ensures we don't produce
- * duplicate names unless we have exhausted our options.
- */
-static int
-find_alternate(
-	size_t		name_len,
-	unsigned char	*name,
-	uint32_t	seq)
-{
-	uint32_t	bitseq = 0;
-	uint32_t	bits = seq;
-
-	if (!seq)
-		return 1;	/* alternate 0 is the original name */
-	if (name_len < 2)	/* Must have 2 bytes to flip */
-		return -1;
-
-	for (bitseq = 0; bits; bitseq++) {
-		uint32_t	mask = 1 << bitseq;
-		int		fb;
-
-		if (!(bits & mask))
-			continue;
-
-		fb = flip_bit(name_len, name, bitseq);
-		if (fb < 1)
-			return fb ? -1 : 0;
-		bits ^= mask;
-	}
-
-	return 1;
-}
-
 /*
  * Look up the given name in the name table.  If it is already
  * present, iterate through a well-defined sequence of alternate
diff --git a/db/obfuscate.c b/db/obfuscate.c
new file mode 100644
index 00000000000..249f22b52ce
--- /dev/null
+++ b/db/obfuscate.c
@@ -0,0 +1,389 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2007, 2011 SGI
+ * All Rights Reserved.
+ */
+#include "libxfs.h"
+#include "init.h"
+#include "obfuscate.h"
+
+static inline unsigned char
+random_filename_char(void)
+{
+	static unsigned char filename_alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+						"abcdefghijklmnopqrstuvwxyz"
+						"0123456789-_";
+
+	return filename_alphabet[random() % (sizeof filename_alphabet - 1)];
+}
+
+#define rol32(x,y)		(((x) << (y)) | ((x) >> (32 - (y))))
+
+/*
+ * Given a name and its hash value, massage the name in such a way
+ * that the result is another name of equal length which shares the
+ * same hash value.
+ */
+void
+obfuscate_name(
+	xfs_dahash_t	hash,
+	size_t		name_len,
+	unsigned char	*name,
+	bool		is_dirent)
+{
+	unsigned char	*oldname = NULL;
+	unsigned char	*newp;
+	int		i;
+	xfs_dahash_t	new_hash;
+	unsigned char	*first;
+	unsigned char	high_bit;
+	int		tries = 0;
+	bool		is_ci_name = is_dirent && xfs_has_asciici(mp);
+	int		shift;
+
+	/*
+	 * Our obfuscation algorithm requires at least 5-character
+	 * names, so don't bother if the name is too short.  We
+	 * work backward from a hash value to determine the last
+	 * five bytes in a name required to produce a new name
+	 * with the same hash.
+	 */
+	if (name_len < 5)
+		return;
+
+	if (is_ci_name) {
+		oldname = alloca(name_len);
+		memcpy(oldname, name, name_len);
+	}
+
+again:
+	newp = name;
+	new_hash = 0;
+
+	/*
+	 * If we cannot generate a ci-compatible obfuscated name after 1000
+	 * tries, don't bother obfuscating the name.
+	 */
+	if (tries++ > 1000) {
+		memcpy(name, oldname, name_len);
+		return;
+	}
+
+	/*
+	 * The beginning of the obfuscated name can be pretty much
+	 * anything, so fill it in with random characters.
+	 * Accumulate its new hash value as we go.
+	 */
+	for (i = 0; i < name_len - 5; i++) {
+		*newp = random_filename_char();
+		if (is_ci_name)
+			new_hash = xfs_ascii_ci_xfrm(*newp) ^
+							rol32(new_hash, 7);
+		else
+			new_hash = *newp ^ rol32(new_hash, 7);
+		newp++;
+	}
+
+	/*
+	 * Compute which five bytes need to be used at the end of
+	 * the name so the hash of the obfuscated name is the same
+	 * as the hash of the original.  If any result in an invalid
+	 * character, flip a bit and arrange for a corresponding bit
+	 * in a neighboring byte to be flipped as well.  For the
+	 * last byte, the "neighbor" to change is the first byte
+	 * we're computing here.
+	 */
+	new_hash = rol32(new_hash, 3) ^ hash;
+
+	first = newp;
+	high_bit = 0;
+	for (shift = 28; shift >= 0; shift -= 7) {
+		*newp = (new_hash >> shift & 0x7f) ^ high_bit;
+		if (is_invalid_char(*newp)) {
+			*newp ^= 1;
+			high_bit = 0x80;
+		} else
+			high_bit = 0;
+
+		/*
+		 * If ascii-ci is enabled, uppercase characters are converted
+		 * to lowercase characters while computing the name hash.  If
+		 * any of the necessary correction bytes are uppercase, the
+		 * hash of the new name will not match.  Try again with a
+		 * different prefix.
+		 */
+		if (is_ci_name && xfs_ascii_ci_need_xfrm(*newp))
+			goto again;
+
+		ASSERT(!is_invalid_char(*newp));
+		newp++;
+	}
+
+	/*
+	 * If we flipped a bit on the last byte, we need to fix up
+	 * the matching bit in the first byte.  The result will
+	 * be a valid character, because we know that first byte
+	 * has 0's in its upper four bits (it was produced by a
+	 * 28-bit right-shift of a 32-bit unsigned value).
+	 */
+	if (high_bit) {
+		*first ^= 0x10;
+
+		if (is_ci_name && xfs_ascii_ci_need_xfrm(*first))
+			goto again;
+
+		ASSERT(!is_invalid_char(*first));
+	}
+}
+
+/*
+ * Flip a bit in each of two bytes at the end of the given name.
+ * This is used in generating a series of alternate names to be used
+ * in the event a duplicate is found.
+ *
+ * The bits flipped are selected such that they both affect the same
+ * bit in the name's computed hash value, so flipping them both will
+ * preserve the hash.
+ *
+ * The following diagram aims to show the portion of a computed
+ * hash that a given byte of a name affects.
+ *
+ *	   31    28      24    21	     14		  8 7       3     0
+ *	   +-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-+
+ * hash:   | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+ *	   +-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-+
+ *	  last-4 ->|	       |<-- last-2 --->|	   |<--- last ---->|
+ *		 |<-- last-3 --->|	     |<-- last-1 --->|     |<- last-4
+ *			 |<-- last-7 --->|	     |<-- last-5 --->|
+ *	   |<-- last-8 --->|	       |<-- last-6 --->|
+ *			. . . and so on
+ *
+ * The last byte of the name directly affects the low-order byte of
+ * the hash.  The next-to-last affects bits 7-14, the next one back
+ * affects bits 14-21, and so on.  The effect wraps around when it
+ * goes beyond the top of the hash (as happens for byte last-4).
+ *
+ * Bits that are flipped together "overlap" on the hash value.  As
+ * an example of overlap, the last two bytes both affect bit 7 in
+ * the hash.  That pair of bytes (and their overlapping bits) can be
+ * used for this "flip bit" operation (it's the first pair tried,
+ * actually).
+ *
+ * A table defines overlapping pairs--the bytes involved and bits
+ * within them--that can be used this way.  The byte offset is
+ * relative to a starting point within the name, which will be set
+ * to affect the bytes at the end of the name.  The function is
+ * called with a "bitseq" value which indicates which bit flip is
+ * desired, and this translates directly into selecting which entry
+ * in the bit_to_flip[] table to apply.
+ *
+ * The function returns 1 if the operation was successful.  It
+ * returns 0 if the result produced a character that's not valid in
+ * a name (either '/' or a '\0').  Finally, it returns -1 if the bit
+ * sequence number is beyond what is supported for a name of this
+ * length.
+ *
+ * Discussion
+ * ----------
+ * (Also see the discussion above find_alternate(), below.)
+ *
+ * In order to make this function work for any length name, the
+ * table is ordered by increasing byte offset, so that the earliest
+ * entries can apply to the shortest strings.  This way all names
+ * are done consistently.
+ *
+ * When bit flips occur, they can convert printable characters
+ * into non-printable ones.  In an effort to reduce the impact of
+ * this, the first bit flips are chosen to affect bytes the end of
+ * the name (and furthermore, toward the low bits of a byte).  Those
+ * bytes are often non-printable anyway because of the way they are
+ * initially selected by obfuscate_name()).  This is accomplished,
+ * using later table entries first.
+ *
+ * Each row in the table doubles the number of alternates that
+ * can be generated.  A two-byte name is limited to using only
+ * the first row, so it's possible to generate two alternates
+ * (the original name, plus the alternate produced by flipping
+ * the one pair of bits).  In a 5-byte name, the effect of the
+ * first byte overlaps the last by 4 its, and there are 8 bits
+ * to flip, allowing for 256 possible alternates.
+ *
+ * Short names (less than 5 bytes) are never even obfuscated, so for
+ * such names the relatively small number of alternates should never
+ * really be a problem.
+ *
+ * Long names (more than 6 bytes, say) are not likely to exhaust
+ * the number of available alternates.  In fact, the table could
+ * probably have stopped at 8 entries, on the assumption that 256
+ * alternates should be enough for most any situation.  The entries
+ * beyond those are present mostly for demonstration of how it could
+ * be populated with more entries, should it ever be necessary to do
+ * so.
+ */
+static int
+flip_bit(
+	size_t		name_len,
+	unsigned char	*name,
+	uint32_t	bitseq)
+{
+	int	index;
+	size_t	offset;
+	unsigned char *p0, *p1;
+	unsigned char m0, m1;
+	struct {
+	    int		byte;	/* Offset from start within name */
+	    unsigned char bit;	/* Bit within that byte */
+	} bit_to_flip[][2] = {	/* Sorted by second entry's byte */
+	    { { 0, 0 }, { 1, 7 } },	/* Each row defines a pair */
+	    { { 1, 0 }, { 2, 7 } },	/* of bytes and a bit within */
+	    { { 2, 0 }, { 3, 7 } },	/* each byte.  Each bit in */
+	    { { 0, 4 }, { 4, 0 } },	/* a pair affects the same */
+	    { { 0, 5 }, { 4, 1 } },	/* bit in the hash, so flipping */
+	    { { 0, 6 }, { 4, 2 } },	/* both will change the name */
+	    { { 0, 7 }, { 4, 3 } },	/* while preserving the hash. */
+	    { { 3, 0 }, { 4, 7 } },
+	    { { 0, 0 }, { 5, 3 } },	/* The first entry's byte offset */
+	    { { 0, 1 }, { 5, 4 } },	/* must be less than the second. */
+	    { { 0, 2 }, { 5, 5 } },
+	    { { 0, 3 }, { 5, 6 } },	/* The table can be extended to */
+	    { { 0, 4 }, { 5, 7 } },	/* an arbitrary number of entries */
+	    { { 4, 0 }, { 5, 7 } },	/* but there's not much point. */
+		/* . . . */
+	};
+
+	/* Find the first entry *not* usable for name of this length */
+
+	for (index = 0; index < ARRAY_SIZE(bit_to_flip); index++)
+		if (bit_to_flip[index][1].byte >= name_len)
+			break;
+
+	/*
+	 * Back up to the last usable entry.  If that number is
+	 * smaller than the bit sequence number, inform the caller
+	 * that nothing this large (or larger) will work.
+	 */
+	if (bitseq > --index)
+		return -1;
+
+	/*
+	 * We will be switching bits at the end of name, with a
+	 * preference for affecting the last bytes first.  Compute
+	 * where in the name we'll start applying the changes.
+	 */
+	offset = name_len - (bit_to_flip[index][1].byte + 1);
+	index -= bitseq;	/* Use later table entries first */
+
+	p0 = name + offset + bit_to_flip[index][0].byte;
+	p1 = name + offset + bit_to_flip[index][1].byte;
+	m0 = 1 << bit_to_flip[index][0].bit;
+	m1 = 1 << bit_to_flip[index][1].bit;
+
+	/* Only change the bytes if it produces valid characters */
+
+	if (is_invalid_char(*p0 ^ m0) || is_invalid_char(*p1 ^ m1))
+		return 0;
+
+	*p0 ^= m0;
+	*p1 ^= m1;
+
+	return 1;
+}
+
+/*
+ * This function generates a well-defined sequence of "alternate"
+ * names for a given name.  An alternate is a name having the same
+ * length and same hash value as the original name.  This is needed
+ * because the algorithm produces only one obfuscated name to use
+ * for a given original name, and it's possible that result matches
+ * a name already seen.  This function checks for this, and if it
+ * occurs, finds another suitable obfuscated name to use.
+ *
+ * Each bit in the binary representation of the sequence number is
+ * used to select one possible "bit flip" operation to perform on
+ * the name.  So for example:
+ *    seq = 0:	selects no bits to flip
+ *    seq = 1:	selects the 0th bit to flip
+ *    seq = 2:	selects the 1st bit to flip
+ *    seq = 3:	selects the 0th and 1st bit to flip
+ *    ... and so on.
+ *
+ * The flip_bit() function takes care of the details of the bit
+ * flipping within the name.  Note that the "1st bit" in this
+ * context is a bit sequence number; i.e. it doesn't necessarily
+ * mean bit 0x02 will be changed.
+ *
+ * If a valid name (one that contains no '/' or '\0' characters) is
+ * produced by this process for the given sequence number, this
+ * function returns 1.  If the result is not valid, it returns 0.
+ * Returns -1 if the sequence number is beyond the the maximum for
+ * names of the given length.
+ *
+ *
+ * Discussion
+ * ----------
+ * The number of alternates available for a given name is dependent
+ * on its length.  A "bit flip" involves inverting two bits in
+ * a name--the two bits being selected such that their values
+ * affect the name's hash value in the same way.  Alternates are
+ * thus generated by inverting the value of pairs of such
+ * "overlapping" bits in the original name.  Each byte after the
+ * first in a name adds at least one bit of overlap to work with.
+ * (See comments above flip_bit() for more discussion on this.)
+ *
+ * So the number of alternates is dependent on the number of such
+ * overlapping bits in a name.  If there are N bit overlaps, there
+ * 2^N alternates for that hash value.
+ *
+ * Here are the number of overlapping bits available for generating
+ * alternates for names of specific lengths:
+ *	1	0	(must have 2 bytes to have any overlap)
+ *	2	1	One bit overlaps--so 2 possible alternates
+ *	3	2	Two bits overlap--so 4 possible alternates
+ *	4	4	Three bits overlap, so 2^3 alternates
+ *	5	8	8 bits overlap (due to wrapping), 256 alternates
+ *	6	18	2^18 alternates
+ *	7	28	2^28 alternates
+ *	   ...
+ * It's clear that the number of alternates grows very quickly with
+ * the length of the name.  But note that the set of alternates
+ * includes invalid names.  And for certain (contrived) names, the
+ * number of valid names is a fairly small fraction of the total
+ * number of alternates.
+ *
+ * The main driver for this infrastructure for coming up with
+ * alternate names is really related to names 5 (or possibly 6)
+ * bytes in length.  5-byte obfuscated names contain no randomly-
+ * generated bytes in them, and the chance of an obfuscated name
+ * matching an already-seen name is too high to just ignore.  This
+ * methodical selection of alternates ensures we don't produce
+ * duplicate names unless we have exhausted our options.
+ */
+int
+find_alternate(
+	size_t		name_len,
+	unsigned char	*name,
+	uint32_t	seq)
+{
+	uint32_t	bitseq = 0;
+	uint32_t	bits = seq;
+
+	if (!seq)
+		return 1;	/* alternate 0 is the original name */
+	if (name_len < 2)	/* Must have 2 bytes to flip */
+		return -1;
+
+	for (bitseq = 0; bits; bitseq++) {
+		uint32_t	mask = 1 << bitseq;
+		int		fb;
+
+		if (!(bits & mask))
+			continue;
+
+		fb = flip_bit(name_len, name, bitseq);
+		if (fb < 1)
+			return fb ? -1 : 0;
+		bits ^= mask;
+	}
+
+	return 1;
+}
diff --git a/db/obfuscate.h b/db/obfuscate.h
new file mode 100644
index 00000000000..afaaca37154
--- /dev/null
+++ b/db/obfuscate.h
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2007, 2011 SGI
+ * All Rights Reserved.
+ */
+#ifndef __DB_OBFUSCATE_H__
+#define __DB_OBFUSCATE_H__
+
+/* Routines to obfuscate directory filenames and xattr names. */
+
+#define is_invalid_char(c)	((c) == '/' || (c) == '\0')
+
+void obfuscate_name(xfs_dahash_t hash, size_t name_len, unsigned char *name,
+		bool is_dirent);
+int find_alternate(size_t name_len, unsigned char *name, uint32_t seq);
+
+#endif /* __DB_OBFUSCATE_H__ */


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 2/3] xfs_db: create dirents and xattrs with colliding names
  2023-06-05 15:36 [PATCHSET 0/3] xfs_db: create names with colliding hashes Darrick J. Wong
  2023-06-05 15:37 ` [PATCH 1/3] xfs_db: hoist name obfuscation code out of metadump.c Darrick J. Wong
@ 2023-06-05 15:37 ` Darrick J. Wong
  2023-06-06 11:33   ` Carlos Maiolino
  2023-06-14 11:32   ` Carlos Maiolino
  2023-06-05 15:37 ` [PATCH 3/3] xfs_db: make the hash command print the dirent hash Darrick J. Wong
  2023-06-05 20:22 ` [PATCHSET 0/3] xfs_db: create names with colliding hashes Andrey Albershteyn
  3 siblings, 2 replies; 11+ messages in thread
From: Darrick J. Wong @ 2023-06-05 15:37 UTC (permalink / raw)
  To: djwong, cem; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

Create a new debugger command that will create dirent and xattr names
that induce dahash collisions.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/hash.c         |  376 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 man/man8/xfs_db.8 |   31 ++++
 2 files changed, 407 insertions(+)


diff --git a/db/hash.c b/db/hash.c
index 68c53e7f9bc..79a250526e9 100644
--- a/db/hash.c
+++ b/db/hash.c
@@ -5,12 +5,15 @@
  */
 
 #include "libxfs.h"
+#include "init.h"
 #include "addr.h"
 #include "command.h"
 #include "type.h"
 #include "io.h"
 #include "output.h"
 #include "hash.h"
+#include "obfuscate.h"
+#include <sys/xattr.h>
 
 static int hash_f(int argc, char **argv);
 static void hash_help(void);
@@ -46,8 +49,381 @@ hash_f(
 	return 0;
 }
 
+static void
+hashcoll_help(void)
+{
+	printf(_(
+"\n"
+" Generate obfuscated variants of the provided name.  Each variant will have\n"
+" the same dahash value.  Names are written to stdout with a NULL separating\n"
+" each name.\n"
+"\n"
+" -a -- create extended attributes.\n"
+" -i -- read standard input for the name, up to %d bytes.\n"
+" -n -- create this many names.\n"
+" -p -- create directory entries or extended attributes in this file.\n"
+" -s -- seed the rng with this value.\n"
+"\n"),
+			MAXNAMELEN - 1);
+}
+
+struct name_dup {
+	struct name_dup	*next;
+	uint32_t	crc;
+	uint8_t		namelen;
+	uint8_t		name[];
+};
+
+static inline size_t
+name_dup_sizeof(
+	unsigned int	namelen)
+{
+	return sizeof(struct name_dup) + namelen;
+}
+
+#define MAX_DUP_TABLE_BUCKETS	(1048575)
+
+struct dup_table {
+	unsigned int	nr_buckets;
+	struct name_dup	*buckets[];
+};
+
+static inline size_t
+dup_table_sizeof(
+	unsigned int	nr_buckets)
+{
+	return sizeof(struct dup_table) +
+				(nr_buckets * sizeof(struct name_dup *));
+}
+
+static int
+dup_table_alloc(
+	unsigned long		nr_names,
+	struct dup_table	**tabp)
+{
+	struct dup_table	*t;
+
+	*tabp = NULL;
+
+	if (nr_names == 1)
+		return 0;
+
+	nr_names = min(MAX_DUP_TABLE_BUCKETS, nr_names);
+	t = calloc(1, dup_table_sizeof(nr_names));
+	if (!t)
+		return ENOMEM;
+
+	t->nr_buckets = nr_names;
+	*tabp = t;
+	return 0;
+}
+
+static void
+dup_table_free(
+	struct dup_table	*tab)
+{
+	struct name_dup		*ent, *next;
+	unsigned int		i;
+
+	if (!tab)
+		return;
+
+	for (i = 0; i < tab->nr_buckets; i++) {
+		ent = tab->buckets[i];
+
+		while (ent) {
+			next = ent->next;
+			free(ent);
+			ent = next;
+		}
+	}
+	free(tab);
+}
+
+static struct name_dup *
+dup_table_find(
+	struct dup_table	*tab,
+	unsigned char		*name,
+	size_t			namelen)
+{
+	struct name_dup		*ent;
+	uint32_t		crc = crc32c(~0, name, namelen);
+
+	ent = tab->buckets[crc % tab->nr_buckets];
+	while (ent) {
+		if (ent->crc == crc &&
+		    ent->namelen == namelen &&
+		    !memcmp(ent->name, name, namelen))
+			return ent;
+
+		ent = ent->next;
+	}
+
+	return NULL;
+}
+
+static int
+dup_table_store(
+	struct dup_table	*tab,
+	unsigned char		*name,
+	size_t			namelen)
+{
+	struct name_dup		*dup;
+	uint32_t		seq = 1;
+
+	ASSERT(namelen < MAXNAMELEN);
+
+	while ((dup = dup_table_find(tab, name, namelen)) != NULL) {
+		int		ret;
+
+		do {
+			ret = find_alternate(namelen, name, seq++);
+		} while (ret == 0);
+		if (ret < 0)
+			return EEXIST;
+	}
+
+	dup = malloc(name_dup_sizeof(namelen));
+	if (!dup)
+		return ENOMEM;
+
+	dup->crc = crc32c(~0, name, namelen);
+	dup->namelen = namelen;
+	memcpy(dup->name, name, namelen);
+	dup->next = tab->buckets[dup->crc % tab->nr_buckets];
+
+	tab->buckets[dup->crc % tab->nr_buckets] = dup;
+	return 0;
+}
+
+static int
+collide_dirents(
+	unsigned long		nr,
+	const unsigned char	*name,
+	size_t			namelen,
+	int			fd)
+{
+	struct xfs_name		dname = {
+		.name		= name,
+		.len		= namelen,
+	};
+	unsigned char		direntname[MAXNAMELEN + 1];
+	struct dup_table	*tab = NULL;
+	xfs_dahash_t		old_hash;
+	unsigned long		i;
+	int			error = 0;
+
+	old_hash = libxfs_dir2_hashname(mp, &dname);
+
+	if (fd >= 0) {
+		int		newfd;
+
+		/*
+		 * User passed in a fd, so we'll use the directory to detect
+		 * duplicate names.  First create the name that we are passed
+		 * in; the new names will be hardlinks to the first file.
+		 */
+		newfd = openat(fd, name, O_CREAT, 0600);
+		if (newfd < 0)
+			return errno;
+		close(newfd);
+	} else if (nr > 1) {
+		/*
+		 * Track every name we create so that we don't emit duplicates.
+		 */
+		error = dup_table_alloc(nr, &tab);
+		if (error)
+			return error;
+	}
+
+	dname.name = direntname;
+	for (i = 0; i < nr; i++) {
+		strncpy(direntname, name, MAXNAMELEN);
+		obfuscate_name(old_hash, namelen, direntname, true);
+		ASSERT(old_hash == libxfs_dir2_hashname(mp, &dname));
+
+		if (fd >= 0) {
+			error = linkat(fd, name, fd, direntname, 0);
+			if (error && errno != EEXIST)
+				return errno;
+
+			/* don't print names to stdout */
+			continue;
+		} else if (tab) {
+			error = dup_table_store(tab, direntname, namelen);
+			if (error)
+				break;
+		}
+
+		printf("%s%c", direntname, 0);
+	}
+
+	dup_table_free(tab);
+	return error;
+}
+
+static int
+collide_xattrs(
+	unsigned long		nr,
+	const unsigned char	*name,
+	size_t			namelen,
+	int			fd)
+{
+	unsigned char		xattrname[MAXNAMELEN + 5];
+	struct dup_table	*tab = NULL;
+	xfs_dahash_t		old_hash;
+	unsigned long		i;
+	int			error;
+
+	old_hash = libxfs_da_hashname(name, namelen);
+
+	if (fd >= 0) {
+		/*
+		 * User passed in a fd, so we'll use the xattr structure to
+		 * detect duplicate names.  First create the attribute that we
+		 * are passed in.
+		 */
+		snprintf(xattrname, MAXNAMELEN + 5, "user.%s", name);
+		error = fsetxattr(fd, xattrname, "1", 1, 0);
+		if (error)
+			return errno;
+	} else if (nr > 1) {
+		/*
+		 * Track every name we create so that we don't emit duplicates.
+		 */
+		error = dup_table_alloc(nr, &tab);
+		if (error)
+			return error;
+	}
+
+	for (i = 0; i < nr; i++) {
+		snprintf(xattrname, MAXNAMELEN + 5, "user.%s", name);
+		obfuscate_name(old_hash, namelen, xattrname + 5, false);
+		ASSERT(old_hash == libxfs_da_hashname(xattrname + 5, namelen));
+
+		if (fd >= 0) {
+			error = fsetxattr(fd, xattrname, "1", 1, 0);
+			if (error)
+				return errno;
+
+			/* don't print names to stdout */
+			continue;
+		} else if (tab) {
+			error = dup_table_store(tab, xattrname, namelen + 5);
+			if (error)
+				break;
+		}
+
+		printf("%s%c", xattrname, 0);
+	}
+
+	dup_table_free(tab);
+	return error;
+}
+
+static int
+hashcoll_f(
+	int		argc,
+	char		**argv)
+{
+	const char	*path = NULL;
+	bool		read_stdin = false;
+	bool		create_xattr = false;
+	unsigned long	nr = 1, seed = 0;
+	int		fd = -1;
+	int		c;
+	int		error;
+
+	while ((c = getopt(argc, argv, "ain:p:s:")) != EOF) {
+		switch (c) {
+		case 'a':
+			create_xattr = true;
+			break;
+		case 'i':
+			read_stdin = true;
+			break;
+		case 'n':
+			nr = strtoul(optarg, NULL, 10);
+			break;
+		case 'p':
+			path = optarg;
+			break;
+		case 's':
+			seed = strtoul(optarg, NULL, 10);
+			break;
+		default:
+			exitcode = 1;
+			hashcoll_help();
+			return 0;
+		}
+	}
+
+	if (path) {
+		int	oflags = O_RDWR;
+
+		if (!create_xattr)
+			oflags = O_RDONLY | O_DIRECTORY;
+
+		fd = open(path, oflags);
+		if (fd < 0) {
+			perror(path);
+			exitcode = 1;
+			return 0;
+		}
+	}
+
+	if (seed)
+		srandom(seed);
+
+	if (read_stdin) {
+		char	buf[MAXNAMELEN];
+		size_t	len;
+
+		len = fread(buf, 1, MAXNAMELEN - 1, stdin);
+
+		if (create_xattr)
+			error = collide_xattrs(nr, buf, len, fd);
+		else
+			error = collide_dirents(nr, buf, len, fd);
+		if (error) {
+			printf(_("hashcoll: %s\n"), strerror(error));
+			exitcode = 1;
+		}
+		goto done;
+	}
+
+	for (c = optind; c < argc; c++) {
+		size_t	len = strlen(argv[c]);
+
+		if (create_xattr)
+			error = collide_xattrs(nr, argv[c], len, fd);
+		else
+			error = collide_dirents(nr, argv[c], len, fd);
+		if (error) {
+			printf(_("hashcoll: %s\n"), strerror(error));
+			exitcode = 1;
+		}
+	}
+
+done:
+	if (fd >= 0)
+		close(fd);
+	return 0;
+}
+
+static cmdinfo_t	hashcoll_cmd = {
+	.name		= "hashcoll",
+	.cfunc		= hashcoll_f,
+	.argmin		= 0,
+	.argmax		= -1,
+	.args		= N_("[-a] [-s seed] [-n nr] [-p path] -i|names..."),
+	.oneline	= N_("create names that produce dahash collisions"),
+	.help		= hashcoll_help,
+};
+
 void
 hash_init(void)
 {
 	add_command(&hash_cmd);
+	add_command(&hashcoll_cmd);
 }
diff --git a/man/man8/xfs_db.8 b/man/man8/xfs_db.8
index 1a2bb7e98fb..fde1c5c6c69 100644
--- a/man/man8/xfs_db.8
+++ b/man/man8/xfs_db.8
@@ -768,6 +768,37 @@ Prints the hash value of
 .I string
 using the hash function of the XFS directory and attribute implementation.
 .TP
+.BI "hashcoll [-a] [-s seed] [-n " nr "] [-p " path "] -i | " names...
+Create directory entries or extended attributes names that all have the same
+hash value.
+The metadump name obfuscation algorithm is used here.
+Names are written to standard output, with a NULL between each name for use
+with xargs -0.
+.RS 1.0i
+.PD 0
+.TP 0.4i
+.TP 0.4i
+.B \-a
+Create extended attribute names.
+.TP 0.4i
+.B \-i
+Read the first name to create from standard input.
+Up to 255 bytes are read.
+If this option is not specified, first names are taken from the command line.
+.TP 0.4i
+.BI \-n " nr"
+Create this many duplicated names.
+The default is to create one name.
+.TP 0.4i
+.BI \-p " path"
+Create directory entries or extended attributes in this file instead of
+writing the names to standard output.
+.TP 0.4i
+.BI \-s " seed"
+Seed the random number generator with this value.
+.PD
+.RE
+.TP
 .BI "help [" command ]
 Print help for one or all commands.
 .TP


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 3/3] xfs_db: make the hash command print the dirent hash
  2023-06-05 15:36 [PATCHSET 0/3] xfs_db: create names with colliding hashes Darrick J. Wong
  2023-06-05 15:37 ` [PATCH 1/3] xfs_db: hoist name obfuscation code out of metadump.c Darrick J. Wong
  2023-06-05 15:37 ` [PATCH 2/3] xfs_db: create dirents and xattrs with colliding names Darrick J. Wong
@ 2023-06-05 15:37 ` Darrick J. Wong
  2023-06-06 11:36   ` Carlos Maiolino
  2023-06-05 20:22 ` [PATCHSET 0/3] xfs_db: create names with colliding hashes Andrey Albershteyn
  3 siblings, 1 reply; 11+ messages in thread
From: Darrick J. Wong @ 2023-06-05 15:37 UTC (permalink / raw)
  To: djwong, cem; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

It turns out that the da and dir2 hashname functions are /not/ the same,
at least not on ascii-ci filesystems.  Enhance this debugger command to
support printing the dir2 hashname.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 db/hash.c         |   42 +++++++++++++++++++++++++++++++++++++-----
 man/man8/xfs_db.8 |    8 +++++++-
 2 files changed, 44 insertions(+), 6 deletions(-)


diff --git a/db/hash.c b/db/hash.c
index 79a250526e9..716da88baf9 100644
--- a/db/hash.c
+++ b/db/hash.c
@@ -18,9 +18,15 @@
 static int hash_f(int argc, char **argv);
 static void hash_help(void);
 
-static const cmdinfo_t hash_cmd =
-	{ "hash", NULL, hash_f, 1, 1, 0, N_("string"),
-	  N_("calculate hash value"), hash_help };
+static const cmdinfo_t hash_cmd = {
+	.name		= "hash",
+	.cfunc		= hash_f,
+	.argmin		= 1,
+	.argmax		= -1,
+	.args		= N_("string"),
+	.oneline	= N_("calculate hash value"),
+	.help		= hash_help,
+};
 
 static void
 hash_help(void)
@@ -43,9 +49,35 @@ hash_f(
 	char		**argv)
 {
 	xfs_dahash_t	hashval;
+	bool		use_dir2_hash = false;
+	int		c;
+
+	while ((c = getopt(argc, argv, "d")) != EOF) {
+		switch (c) {
+		case 'd':
+			use_dir2_hash = true;
+			break;
+		default:
+			exitcode = 1;
+			hash_help();
+			return 0;
+		}
+	}
+
+	for (c = optind; c < argc; c++) {
+		if (use_dir2_hash) {
+			struct xfs_name	xname = {
+				.name	= (uint8_t *)argv[c],
+				.len	= strlen(argv[c]),
+			};
+
+			hashval = libxfs_dir2_hashname(mp, &xname);
+		} else {
+			hashval = libxfs_da_hashname(argv[c], strlen(argv[c]));
+		}
+		dbprintf("0x%x\n", hashval);
+	}
 
-	hashval = libxfs_da_hashname((unsigned char *)argv[1], (int)strlen(argv[1]));
-	dbprintf("0x%x\n", hashval);
 	return 0;
 }
 
diff --git a/man/man8/xfs_db.8 b/man/man8/xfs_db.8
index fde1c5c6c69..60dcdc52cba 100644
--- a/man/man8/xfs_db.8
+++ b/man/man8/xfs_db.8
@@ -763,10 +763,16 @@ Skip write verifiers but perform CRC recalculation; allows invalid data to be
 written to disk to test detection of invalid data.
 .RE
 .TP
-.BI hash " string
+.BI hash [-d]" strings
 Prints the hash value of
 .I string
 using the hash function of the XFS directory and attribute implementation.
+
+If the
+.B \-d
+option is specified, the directory-specific hash function is used.
+This only makes a difference on filesystems with ascii case-insensitive
+lookups enabled.
 .TP
 .BI "hashcoll [-a] [-s seed] [-n " nr "] [-p " path "] -i | " names...
 Create directory entries or extended attributes names that all have the same


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCHSET 0/3] xfs_db: create names with colliding hashes
  2023-06-05 15:36 [PATCHSET 0/3] xfs_db: create names with colliding hashes Darrick J. Wong
                   ` (2 preceding siblings ...)
  2023-06-05 15:37 ` [PATCH 3/3] xfs_db: make the hash command print the dirent hash Darrick J. Wong
@ 2023-06-05 20:22 ` Andrey Albershteyn
  3 siblings, 0 replies; 11+ messages in thread
From: Andrey Albershteyn @ 2023-06-05 20:22 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: cem, linux-xfs

On 2023-06-05 08:36:58, Darrick J. Wong wrote:
> Hi all,
> 
> While we're on the topic of directory entry naming, create a couple of
> new debugger commands to create directory or xattr names that have the
> same name hash.  This enables further testing of that aspect of the
> dabtree code, which in turn enables us to perform worst case performance
> analysis of the parent pointers code.
> 
> If you're going to start using this mess, you probably ought to just
> pull from my git trees, which are linked below.
> 
> This is an extraordinary way to destroy everything.  Enjoy!
> Comments and questions are, as always, welcome.
> 
> --D
> 
> xfsprogs git tree:
> https://git.kernel.org/cgit/linux/kernel/git/djwong/xfsprogs-dev.git/log/?h=db-hash-collisions
> ---
>  db/Makefile       |    2 
>  db/hash.c         |  418 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  db/metadump.c     |  383 -------------------------------------------------
>  db/obfuscate.c    |  389 +++++++++++++++++++++++++++++++++++++++++++++++++
>  db/obfuscate.h    |   17 ++
>  man/man8/xfs_db.8 |   39 +++++
>  6 files changed, 859 insertions(+), 389 deletions(-)
>  create mode 100644 db/obfuscate.c
>  create mode 100644 db/obfuscate.h
> 

This patchset looks good to me. Seems to work.
Reviewed-by: Andrey Albershteyn <aalbersh@redhat.com>

-- 
- Andrey


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/3] xfs_db: hoist name obfuscation code out of metadump.c
  2023-06-05 15:37 ` [PATCH 1/3] xfs_db: hoist name obfuscation code out of metadump.c Darrick J. Wong
@ 2023-06-06 11:17   ` Carlos Maiolino
  2023-06-15 16:12   ` [PATCH v2 " Darrick J. Wong
  1 sibling, 0 replies; 11+ messages in thread
From: Carlos Maiolino @ 2023-06-06 11:17 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Mon, Jun 05, 2023 at 08:37:04AM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
> 
> We want to create a debugger command that will create obfuscated names
> for directory and xattr names, so hoist the name obfuscation code into a
> separate file.
> 
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>

Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>

> ---
>  db/Makefile    |    2
>  db/metadump.c  |  383 -------------------------------------------------------
>  db/obfuscate.c |  389 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  db/obfuscate.h |   17 ++
>  4 files changed, 408 insertions(+), 383 deletions(-)
>  create mode 100644 db/obfuscate.c
>  create mode 100644 db/obfuscate.h
> 
> 
> diff --git a/db/Makefile b/db/Makefile
> index b2e01174571..2f95f67075d 100644
> --- a/db/Makefile
> +++ b/db/Makefile
> @@ -13,7 +13,7 @@ HFILES = addr.h agf.h agfl.h agi.h attr.h attrshort.h bit.h block.h bmap.h \
>  	flist.h fprint.h frag.h freesp.h hash.h help.h init.h inode.h input.h \
>  	io.h logformat.h malloc.h metadump.h output.h print.h quit.h sb.h \
>  	sig.h strvec.h text.h type.h write.h attrset.h symlink.h fsmap.h \
> -	fuzz.h
> +	fuzz.h obfuscate.h
>  CFILES = $(HFILES:.h=.c) btdump.c btheight.c convert.c info.c namei.c \
>  	timelimit.c
>  LSRCFILES = xfs_admin.sh xfs_ncheck.sh xfs_metadump.sh
> diff --git a/db/metadump.c b/db/metadump.c
> index 4f8b3adb163..d9a616a9296 100644
> --- a/db/metadump.c
> +++ b/db/metadump.c
> @@ -19,6 +19,7 @@
>  #include "faddr.h"
>  #include "field.h"
>  #include "dir2.h"
> +#include "obfuscate.h"
> 
>  #define DEFAULT_MAX_EXT_SIZE	XFS_MAX_BMBT_EXTLEN
> 
> @@ -736,19 +737,6 @@ nametable_add(xfs_dahash_t hash, int namelen, unsigned char *name)
>  	return ent;
>  }
> 
> -#define is_invalid_char(c)	((c) == '/' || (c) == '\0')
> -#define rol32(x,y)		(((x) << (y)) | ((x) >> (32 - (y))))
> -
> -static inline unsigned char
> -random_filename_char(void)
> -{
> -	static unsigned char filename_alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
> -						"abcdefghijklmnopqrstuvwxyz"
> -						"0123456789-_";
> -
> -	return filename_alphabet[random() % (sizeof filename_alphabet - 1)];
> -}
> -
>  #define	ORPHANAGE	"lost+found"
>  #define	ORPHANAGE_LEN	(sizeof (ORPHANAGE) - 1)
> 
> @@ -808,375 +796,6 @@ in_lost_found(
>  	return slen == namelen && !memcmp(name, s, namelen);
>  }
> 
> -/*
> - * Given a name and its hash value, massage the name in such a way
> - * that the result is another name of equal length which shares the
> - * same hash value.
> - */
> -static void
> -obfuscate_name(
> -	xfs_dahash_t	hash,
> -	size_t		name_len,
> -	unsigned char	*name,
> -	bool		is_dirent)
> -{
> -	unsigned char	*oldname = NULL;
> -	unsigned char	*newp;
> -	int		i;
> -	xfs_dahash_t	new_hash;
> -	unsigned char	*first;
> -	unsigned char	high_bit;
> -	int		tries = 0;
> -	bool		is_ci_name = is_dirent && xfs_has_asciici(mp);
> -	int		shift;
> -
> -	/*
> -	 * Our obfuscation algorithm requires at least 5-character
> -	 * names, so don't bother if the name is too short.  We
> -	 * work backward from a hash value to determine the last
> -	 * five bytes in a name required to produce a new name
> -	 * with the same hash.
> -	 */
> -	if (name_len < 5)
> -		return;
> -
> -	if (is_ci_name) {
> -		oldname = alloca(name_len);
> -		memcpy(oldname, name, name_len);
> -	}
> -
> -again:
> -	newp = name;
> -	new_hash = 0;
> -
> -	/*
> -	 * If we cannot generate a ci-compatible obfuscated name after 1000
> -	 * tries, don't bother obfuscating the name.
> -	 */
> -	if (tries++ > 1000) {
> -		memcpy(name, oldname, name_len);
> -		return;
> -	}
> -
> -	/*
> -	 * The beginning of the obfuscated name can be pretty much
> -	 * anything, so fill it in with random characters.
> -	 * Accumulate its new hash value as we go.
> -	 */
> -	for (i = 0; i < name_len - 5; i++) {
> -		*newp = random_filename_char();
> -		if (is_ci_name)
> -			new_hash = xfs_ascii_ci_xfrm(*newp) ^
> -							rol32(new_hash, 7);
> -		else
> -			new_hash = *newp ^ rol32(new_hash, 7);
> -		newp++;
> -	}
> -
> -	/*
> -	 * Compute which five bytes need to be used at the end of
> -	 * the name so the hash of the obfuscated name is the same
> -	 * as the hash of the original.  If any result in an invalid
> -	 * character, flip a bit and arrange for a corresponding bit
> -	 * in a neighboring byte to be flipped as well.  For the
> -	 * last byte, the "neighbor" to change is the first byte
> -	 * we're computing here.
> -	 */
> -	new_hash = rol32(new_hash, 3) ^ hash;
> -
> -	first = newp;
> -	high_bit = 0;
> -	for (shift = 28; shift >= 0; shift -= 7) {
> -		*newp = (new_hash >> shift & 0x7f) ^ high_bit;
> -		if (is_invalid_char(*newp)) {
> -			*newp ^= 1;
> -			high_bit = 0x80;
> -		} else
> -			high_bit = 0;
> -
> -		/*
> -		 * If ascii-ci is enabled, uppercase characters are converted
> -		 * to lowercase characters while computing the name hash.  If
> -		 * any of the necessary correction bytes are uppercase, the
> -		 * hash of the new name will not match.  Try again with a
> -		 * different prefix.
> -		 */
> -		if (is_ci_name && xfs_ascii_ci_need_xfrm(*newp))
> -			goto again;
> -
> -		ASSERT(!is_invalid_char(*newp));
> -		newp++;
> -	}
> -
> -	/*
> -	 * If we flipped a bit on the last byte, we need to fix up
> -	 * the matching bit in the first byte.  The result will
> -	 * be a valid character, because we know that first byte
> -	 * has 0's in its upper four bits (it was produced by a
> -	 * 28-bit right-shift of a 32-bit unsigned value).
> -	 */
> -	if (high_bit) {
> -		*first ^= 0x10;
> -
> -		if (is_ci_name && xfs_ascii_ci_need_xfrm(*first))
> -			goto again;
> -
> -		ASSERT(!is_invalid_char(*first));
> -	}
> -}
> -
> -/*
> - * Flip a bit in each of two bytes at the end of the given name.
> - * This is used in generating a series of alternate names to be used
> - * in the event a duplicate is found.
> - *
> - * The bits flipped are selected such that they both affect the same
> - * bit in the name's computed hash value, so flipping them both will
> - * preserve the hash.
> - *
> - * The following diagram aims to show the portion of a computed
> - * hash that a given byte of a name affects.
> - *
> - *	   31    28      24    21	     14		  8 7       3     0
> - *	   +-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-+
> - * hash:   | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
> - *	   +-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-+
> - *	  last-4 ->|	       |<-- last-2 --->|	   |<--- last ---->|
> - *		 |<-- last-3 --->|	     |<-- last-1 --->|     |<- last-4
> - *			 |<-- last-7 --->|	     |<-- last-5 --->|
> - *	   |<-- last-8 --->|	       |<-- last-6 --->|
> - *			. . . and so on
> - *
> - * The last byte of the name directly affects the low-order byte of
> - * the hash.  The next-to-last affects bits 7-14, the next one back
> - * affects bits 14-21, and so on.  The effect wraps around when it
> - * goes beyond the top of the hash (as happens for byte last-4).
> - *
> - * Bits that are flipped together "overlap" on the hash value.  As
> - * an example of overlap, the last two bytes both affect bit 7 in
> - * the hash.  That pair of bytes (and their overlapping bits) can be
> - * used for this "flip bit" operation (it's the first pair tried,
> - * actually).
> - *
> - * A table defines overlapping pairs--the bytes involved and bits
> - * within them--that can be used this way.  The byte offset is
> - * relative to a starting point within the name, which will be set
> - * to affect the bytes at the end of the name.  The function is
> - * called with a "bitseq" value which indicates which bit flip is
> - * desired, and this translates directly into selecting which entry
> - * in the bit_to_flip[] table to apply.
> - *
> - * The function returns 1 if the operation was successful.  It
> - * returns 0 if the result produced a character that's not valid in
> - * a name (either '/' or a '\0').  Finally, it returns -1 if the bit
> - * sequence number is beyond what is supported for a name of this
> - * length.
> - *
> - * Discussion
> - * ----------
> - * (Also see the discussion above find_alternate(), below.)
> - *
> - * In order to make this function work for any length name, the
> - * table is ordered by increasing byte offset, so that the earliest
> - * entries can apply to the shortest strings.  This way all names
> - * are done consistently.
> - *
> - * When bit flips occur, they can convert printable characters
> - * into non-printable ones.  In an effort to reduce the impact of
> - * this, the first bit flips are chosen to affect bytes the end of
> - * the name (and furthermore, toward the low bits of a byte).  Those
> - * bytes are often non-printable anyway because of the way they are
> - * initially selected by obfuscate_name()).  This is accomplished,
> - * using later table entries first.
> - *
> - * Each row in the table doubles the number of alternates that
> - * can be generated.  A two-byte name is limited to using only
> - * the first row, so it's possible to generate two alternates
> - * (the original name, plus the alternate produced by flipping
> - * the one pair of bits).  In a 5-byte name, the effect of the
> - * first byte overlaps the last by 4 its, and there are 8 bits
> - * to flip, allowing for 256 possible alternates.
> - *
> - * Short names (less than 5 bytes) are never even obfuscated, so for
> - * such names the relatively small number of alternates should never
> - * really be a problem.
> - *
> - * Long names (more than 6 bytes, say) are not likely to exhaust
> - * the number of available alternates.  In fact, the table could
> - * probably have stopped at 8 entries, on the assumption that 256
> - * alternates should be enough for most any situation.  The entries
> - * beyond those are present mostly for demonstration of how it could
> - * be populated with more entries, should it ever be necessary to do
> - * so.
> - */
> -static int
> -flip_bit(
> -	size_t		name_len,
> -	unsigned char	*name,
> -	uint32_t	bitseq)
> -{
> -	int	index;
> -	size_t	offset;
> -	unsigned char *p0, *p1;
> -	unsigned char m0, m1;
> -	struct {
> -	    int		byte;	/* Offset from start within name */
> -	    unsigned char bit;	/* Bit within that byte */
> -	} bit_to_flip[][2] = {	/* Sorted by second entry's byte */
> -	    { { 0, 0 }, { 1, 7 } },	/* Each row defines a pair */
> -	    { { 1, 0 }, { 2, 7 } },	/* of bytes and a bit within */
> -	    { { 2, 0 }, { 3, 7 } },	/* each byte.  Each bit in */
> -	    { { 0, 4 }, { 4, 0 } },	/* a pair affects the same */
> -	    { { 0, 5 }, { 4, 1 } },	/* bit in the hash, so flipping */
> -	    { { 0, 6 }, { 4, 2 } },	/* both will change the name */
> -	    { { 0, 7 }, { 4, 3 } },	/* while preserving the hash. */
> -	    { { 3, 0 }, { 4, 7 } },
> -	    { { 0, 0 }, { 5, 3 } },	/* The first entry's byte offset */
> -	    { { 0, 1 }, { 5, 4 } },	/* must be less than the second. */
> -	    { { 0, 2 }, { 5, 5 } },
> -	    { { 0, 3 }, { 5, 6 } },	/* The table can be extended to */
> -	    { { 0, 4 }, { 5, 7 } },	/* an arbitrary number of entries */
> -	    { { 4, 0 }, { 5, 7 } },	/* but there's not much point. */
> -		/* . . . */
> -	};
> -
> -	/* Find the first entry *not* usable for name of this length */
> -
> -	for (index = 0; index < ARRAY_SIZE(bit_to_flip); index++)
> -		if (bit_to_flip[index][1].byte >= name_len)
> -			break;
> -
> -	/*
> -	 * Back up to the last usable entry.  If that number is
> -	 * smaller than the bit sequence number, inform the caller
> -	 * that nothing this large (or larger) will work.
> -	 */
> -	if (bitseq > --index)
> -		return -1;
> -
> -	/*
> -	 * We will be switching bits at the end of name, with a
> -	 * preference for affecting the last bytes first.  Compute
> -	 * where in the name we'll start applying the changes.
> -	 */
> -	offset = name_len - (bit_to_flip[index][1].byte + 1);
> -	index -= bitseq;	/* Use later table entries first */
> -
> -	p0 = name + offset + bit_to_flip[index][0].byte;
> -	p1 = name + offset + bit_to_flip[index][1].byte;
> -	m0 = 1 << bit_to_flip[index][0].bit;
> -	m1 = 1 << bit_to_flip[index][1].bit;
> -
> -	/* Only change the bytes if it produces valid characters */
> -
> -	if (is_invalid_char(*p0 ^ m0) || is_invalid_char(*p1 ^ m1))
> -		return 0;
> -
> -	*p0 ^= m0;
> -	*p1 ^= m1;
> -
> -	return 1;
> -}
> -
> -/*
> - * This function generates a well-defined sequence of "alternate"
> - * names for a given name.  An alternate is a name having the same
> - * length and same hash value as the original name.  This is needed
> - * because the algorithm produces only one obfuscated name to use
> - * for a given original name, and it's possible that result matches
> - * a name already seen.  This function checks for this, and if it
> - * occurs, finds another suitable obfuscated name to use.
> - *
> - * Each bit in the binary representation of the sequence number is
> - * used to select one possible "bit flip" operation to perform on
> - * the name.  So for example:
> - *    seq = 0:	selects no bits to flip
> - *    seq = 1:	selects the 0th bit to flip
> - *    seq = 2:	selects the 1st bit to flip
> - *    seq = 3:	selects the 0th and 1st bit to flip
> - *    ... and so on.
> - *
> - * The flip_bit() function takes care of the details of the bit
> - * flipping within the name.  Note that the "1st bit" in this
> - * context is a bit sequence number; i.e. it doesn't necessarily
> - * mean bit 0x02 will be changed.
> - *
> - * If a valid name (one that contains no '/' or '\0' characters) is
> - * produced by this process for the given sequence number, this
> - * function returns 1.  If the result is not valid, it returns 0.
> - * Returns -1 if the sequence number is beyond the the maximum for
> - * names of the given length.
> - *
> - *
> - * Discussion
> - * ----------
> - * The number of alternates available for a given name is dependent
> - * on its length.  A "bit flip" involves inverting two bits in
> - * a name--the two bits being selected such that their values
> - * affect the name's hash value in the same way.  Alternates are
> - * thus generated by inverting the value of pairs of such
> - * "overlapping" bits in the original name.  Each byte after the
> - * first in a name adds at least one bit of overlap to work with.
> - * (See comments above flip_bit() for more discussion on this.)
> - *
> - * So the number of alternates is dependent on the number of such
> - * overlapping bits in a name.  If there are N bit overlaps, there
> - * 2^N alternates for that hash value.
> - *
> - * Here are the number of overlapping bits available for generating
> - * alternates for names of specific lengths:
> - *	1	0	(must have 2 bytes to have any overlap)
> - *	2	1	One bit overlaps--so 2 possible alternates
> - *	3	2	Two bits overlap--so 4 possible alternates
> - *	4	4	Three bits overlap, so 2^3 alternates
> - *	5	8	8 bits overlap (due to wrapping), 256 alternates
> - *	6	18	2^18 alternates
> - *	7	28	2^28 alternates
> - *	   ...
> - * It's clear that the number of alternates grows very quickly with
> - * the length of the name.  But note that the set of alternates
> - * includes invalid names.  And for certain (contrived) names, the
> - * number of valid names is a fairly small fraction of the total
> - * number of alternates.
> - *
> - * The main driver for this infrastructure for coming up with
> - * alternate names is really related to names 5 (or possibly 6)
> - * bytes in length.  5-byte obfuscated names contain no randomly-
> - * generated bytes in them, and the chance of an obfuscated name
> - * matching an already-seen name is too high to just ignore.  This
> - * methodical selection of alternates ensures we don't produce
> - * duplicate names unless we have exhausted our options.
> - */
> -static int
> -find_alternate(
> -	size_t		name_len,
> -	unsigned char	*name,
> -	uint32_t	seq)
> -{
> -	uint32_t	bitseq = 0;
> -	uint32_t	bits = seq;
> -
> -	if (!seq)
> -		return 1;	/* alternate 0 is the original name */
> -	if (name_len < 2)	/* Must have 2 bytes to flip */
> -		return -1;
> -
> -	for (bitseq = 0; bits; bitseq++) {
> -		uint32_t	mask = 1 << bitseq;
> -		int		fb;
> -
> -		if (!(bits & mask))
> -			continue;
> -
> -		fb = flip_bit(name_len, name, bitseq);
> -		if (fb < 1)
> -			return fb ? -1 : 0;
> -		bits ^= mask;
> -	}
> -
> -	return 1;
> -}
> -
>  /*
>   * Look up the given name in the name table.  If it is already
>   * present, iterate through a well-defined sequence of alternate
> diff --git a/db/obfuscate.c b/db/obfuscate.c
> new file mode 100644
> index 00000000000..249f22b52ce
> --- /dev/null
> +++ b/db/obfuscate.c
> @@ -0,0 +1,389 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2007, 2011 SGI
> + * All Rights Reserved.
> + */
> +#include "libxfs.h"
> +#include "init.h"
> +#include "obfuscate.h"
> +
> +static inline unsigned char
> +random_filename_char(void)
> +{
> +	static unsigned char filename_alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
> +						"abcdefghijklmnopqrstuvwxyz"
> +						"0123456789-_";
> +
> +	return filename_alphabet[random() % (sizeof filename_alphabet - 1)];
> +}
> +
> +#define rol32(x,y)		(((x) << (y)) | ((x) >> (32 - (y))))
> +
> +/*
> + * Given a name and its hash value, massage the name in such a way
> + * that the result is another name of equal length which shares the
> + * same hash value.
> + */
> +void
> +obfuscate_name(
> +	xfs_dahash_t	hash,
> +	size_t		name_len,
> +	unsigned char	*name,
> +	bool		is_dirent)
> +{
> +	unsigned char	*oldname = NULL;
> +	unsigned char	*newp;
> +	int		i;
> +	xfs_dahash_t	new_hash;
> +	unsigned char	*first;
> +	unsigned char	high_bit;
> +	int		tries = 0;
> +	bool		is_ci_name = is_dirent && xfs_has_asciici(mp);
> +	int		shift;
> +
> +	/*
> +	 * Our obfuscation algorithm requires at least 5-character
> +	 * names, so don't bother if the name is too short.  We
> +	 * work backward from a hash value to determine the last
> +	 * five bytes in a name required to produce a new name
> +	 * with the same hash.
> +	 */
> +	if (name_len < 5)
> +		return;
> +
> +	if (is_ci_name) {
> +		oldname = alloca(name_len);
> +		memcpy(oldname, name, name_len);
> +	}
> +
> +again:
> +	newp = name;
> +	new_hash = 0;
> +
> +	/*
> +	 * If we cannot generate a ci-compatible obfuscated name after 1000
> +	 * tries, don't bother obfuscating the name.
> +	 */
> +	if (tries++ > 1000) {
> +		memcpy(name, oldname, name_len);
> +		return;
> +	}
> +
> +	/*
> +	 * The beginning of the obfuscated name can be pretty much
> +	 * anything, so fill it in with random characters.
> +	 * Accumulate its new hash value as we go.
> +	 */
> +	for (i = 0; i < name_len - 5; i++) {
> +		*newp = random_filename_char();
> +		if (is_ci_name)
> +			new_hash = xfs_ascii_ci_xfrm(*newp) ^
> +							rol32(new_hash, 7);
> +		else
> +			new_hash = *newp ^ rol32(new_hash, 7);
> +		newp++;
> +	}
> +
> +	/*
> +	 * Compute which five bytes need to be used at the end of
> +	 * the name so the hash of the obfuscated name is the same
> +	 * as the hash of the original.  If any result in an invalid
> +	 * character, flip a bit and arrange for a corresponding bit
> +	 * in a neighboring byte to be flipped as well.  For the
> +	 * last byte, the "neighbor" to change is the first byte
> +	 * we're computing here.
> +	 */
> +	new_hash = rol32(new_hash, 3) ^ hash;
> +
> +	first = newp;
> +	high_bit = 0;
> +	for (shift = 28; shift >= 0; shift -= 7) {
> +		*newp = (new_hash >> shift & 0x7f) ^ high_bit;
> +		if (is_invalid_char(*newp)) {
> +			*newp ^= 1;
> +			high_bit = 0x80;
> +		} else
> +			high_bit = 0;
> +
> +		/*
> +		 * If ascii-ci is enabled, uppercase characters are converted
> +		 * to lowercase characters while computing the name hash.  If
> +		 * any of the necessary correction bytes are uppercase, the
> +		 * hash of the new name will not match.  Try again with a
> +		 * different prefix.
> +		 */
> +		if (is_ci_name && xfs_ascii_ci_need_xfrm(*newp))
> +			goto again;
> +
> +		ASSERT(!is_invalid_char(*newp));
> +		newp++;
> +	}
> +
> +	/*
> +	 * If we flipped a bit on the last byte, we need to fix up
> +	 * the matching bit in the first byte.  The result will
> +	 * be a valid character, because we know that first byte
> +	 * has 0's in its upper four bits (it was produced by a
> +	 * 28-bit right-shift of a 32-bit unsigned value).
> +	 */
> +	if (high_bit) {
> +		*first ^= 0x10;
> +
> +		if (is_ci_name && xfs_ascii_ci_need_xfrm(*first))
> +			goto again;
> +
> +		ASSERT(!is_invalid_char(*first));
> +	}
> +}
> +
> +/*
> + * Flip a bit in each of two bytes at the end of the given name.
> + * This is used in generating a series of alternate names to be used
> + * in the event a duplicate is found.
> + *
> + * The bits flipped are selected such that they both affect the same
> + * bit in the name's computed hash value, so flipping them both will
> + * preserve the hash.
> + *
> + * The following diagram aims to show the portion of a computed
> + * hash that a given byte of a name affects.
> + *
> + *	   31    28      24    21	     14		  8 7       3     0
> + *	   +-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-+
> + * hash:   | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
> + *	   +-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-+
> + *	  last-4 ->|	       |<-- last-2 --->|	   |<--- last ---->|
> + *		 |<-- last-3 --->|	     |<-- last-1 --->|     |<- last-4
> + *			 |<-- last-7 --->|	     |<-- last-5 --->|
> + *	   |<-- last-8 --->|	       |<-- last-6 --->|
> + *			. . . and so on
> + *
> + * The last byte of the name directly affects the low-order byte of
> + * the hash.  The next-to-last affects bits 7-14, the next one back
> + * affects bits 14-21, and so on.  The effect wraps around when it
> + * goes beyond the top of the hash (as happens for byte last-4).
> + *
> + * Bits that are flipped together "overlap" on the hash value.  As
> + * an example of overlap, the last two bytes both affect bit 7 in
> + * the hash.  That pair of bytes (and their overlapping bits) can be
> + * used for this "flip bit" operation (it's the first pair tried,
> + * actually).
> + *
> + * A table defines overlapping pairs--the bytes involved and bits
> + * within them--that can be used this way.  The byte offset is
> + * relative to a starting point within the name, which will be set
> + * to affect the bytes at the end of the name.  The function is
> + * called with a "bitseq" value which indicates which bit flip is
> + * desired, and this translates directly into selecting which entry
> + * in the bit_to_flip[] table to apply.
> + *
> + * The function returns 1 if the operation was successful.  It
> + * returns 0 if the result produced a character that's not valid in
> + * a name (either '/' or a '\0').  Finally, it returns -1 if the bit
> + * sequence number is beyond what is supported for a name of this
> + * length.
> + *
> + * Discussion
> + * ----------
> + * (Also see the discussion above find_alternate(), below.)
> + *
> + * In order to make this function work for any length name, the
> + * table is ordered by increasing byte offset, so that the earliest
> + * entries can apply to the shortest strings.  This way all names
> + * are done consistently.
> + *
> + * When bit flips occur, they can convert printable characters
> + * into non-printable ones.  In an effort to reduce the impact of
> + * this, the first bit flips are chosen to affect bytes the end of
> + * the name (and furthermore, toward the low bits of a byte).  Those
> + * bytes are often non-printable anyway because of the way they are
> + * initially selected by obfuscate_name()).  This is accomplished,
> + * using later table entries first.
> + *
> + * Each row in the table doubles the number of alternates that
> + * can be generated.  A two-byte name is limited to using only
> + * the first row, so it's possible to generate two alternates
> + * (the original name, plus the alternate produced by flipping
> + * the one pair of bits).  In a 5-byte name, the effect of the
> + * first byte overlaps the last by 4 its, and there are 8 bits
> + * to flip, allowing for 256 possible alternates.
> + *
> + * Short names (less than 5 bytes) are never even obfuscated, so for
> + * such names the relatively small number of alternates should never
> + * really be a problem.
> + *
> + * Long names (more than 6 bytes, say) are not likely to exhaust
> + * the number of available alternates.  In fact, the table could
> + * probably have stopped at 8 entries, on the assumption that 256
> + * alternates should be enough for most any situation.  The entries
> + * beyond those are present mostly for demonstration of how it could
> + * be populated with more entries, should it ever be necessary to do
> + * so.
> + */
> +static int
> +flip_bit(
> +	size_t		name_len,
> +	unsigned char	*name,
> +	uint32_t	bitseq)
> +{
> +	int	index;
> +	size_t	offset;
> +	unsigned char *p0, *p1;
> +	unsigned char m0, m1;
> +	struct {
> +	    int		byte;	/* Offset from start within name */
> +	    unsigned char bit;	/* Bit within that byte */
> +	} bit_to_flip[][2] = {	/* Sorted by second entry's byte */
> +	    { { 0, 0 }, { 1, 7 } },	/* Each row defines a pair */
> +	    { { 1, 0 }, { 2, 7 } },	/* of bytes and a bit within */
> +	    { { 2, 0 }, { 3, 7 } },	/* each byte.  Each bit in */
> +	    { { 0, 4 }, { 4, 0 } },	/* a pair affects the same */
> +	    { { 0, 5 }, { 4, 1 } },	/* bit in the hash, so flipping */
> +	    { { 0, 6 }, { 4, 2 } },	/* both will change the name */
> +	    { { 0, 7 }, { 4, 3 } },	/* while preserving the hash. */
> +	    { { 3, 0 }, { 4, 7 } },
> +	    { { 0, 0 }, { 5, 3 } },	/* The first entry's byte offset */
> +	    { { 0, 1 }, { 5, 4 } },	/* must be less than the second. */
> +	    { { 0, 2 }, { 5, 5 } },
> +	    { { 0, 3 }, { 5, 6 } },	/* The table can be extended to */
> +	    { { 0, 4 }, { 5, 7 } },	/* an arbitrary number of entries */
> +	    { { 4, 0 }, { 5, 7 } },	/* but there's not much point. */
> +		/* . . . */
> +	};
> +
> +	/* Find the first entry *not* usable for name of this length */
> +
> +	for (index = 0; index < ARRAY_SIZE(bit_to_flip); index++)
> +		if (bit_to_flip[index][1].byte >= name_len)
> +			break;
> +
> +	/*
> +	 * Back up to the last usable entry.  If that number is
> +	 * smaller than the bit sequence number, inform the caller
> +	 * that nothing this large (or larger) will work.
> +	 */
> +	if (bitseq > --index)
> +		return -1;
> +
> +	/*
> +	 * We will be switching bits at the end of name, with a
> +	 * preference for affecting the last bytes first.  Compute
> +	 * where in the name we'll start applying the changes.
> +	 */
> +	offset = name_len - (bit_to_flip[index][1].byte + 1);
> +	index -= bitseq;	/* Use later table entries first */
> +
> +	p0 = name + offset + bit_to_flip[index][0].byte;
> +	p1 = name + offset + bit_to_flip[index][1].byte;
> +	m0 = 1 << bit_to_flip[index][0].bit;
> +	m1 = 1 << bit_to_flip[index][1].bit;
> +
> +	/* Only change the bytes if it produces valid characters */
> +
> +	if (is_invalid_char(*p0 ^ m0) || is_invalid_char(*p1 ^ m1))
> +		return 0;
> +
> +	*p0 ^= m0;
> +	*p1 ^= m1;
> +
> +	return 1;
> +}
> +
> +/*
> + * This function generates a well-defined sequence of "alternate"
> + * names for a given name.  An alternate is a name having the same
> + * length and same hash value as the original name.  This is needed
> + * because the algorithm produces only one obfuscated name to use
> + * for a given original name, and it's possible that result matches
> + * a name already seen.  This function checks for this, and if it
> + * occurs, finds another suitable obfuscated name to use.
> + *
> + * Each bit in the binary representation of the sequence number is
> + * used to select one possible "bit flip" operation to perform on
> + * the name.  So for example:
> + *    seq = 0:	selects no bits to flip
> + *    seq = 1:	selects the 0th bit to flip
> + *    seq = 2:	selects the 1st bit to flip
> + *    seq = 3:	selects the 0th and 1st bit to flip
> + *    ... and so on.
> + *
> + * The flip_bit() function takes care of the details of the bit
> + * flipping within the name.  Note that the "1st bit" in this
> + * context is a bit sequence number; i.e. it doesn't necessarily
> + * mean bit 0x02 will be changed.
> + *
> + * If a valid name (one that contains no '/' or '\0' characters) is
> + * produced by this process for the given sequence number, this
> + * function returns 1.  If the result is not valid, it returns 0.
> + * Returns -1 if the sequence number is beyond the the maximum for
> + * names of the given length.
> + *
> + *
> + * Discussion
> + * ----------
> + * The number of alternates available for a given name is dependent
> + * on its length.  A "bit flip" involves inverting two bits in
> + * a name--the two bits being selected such that their values
> + * affect the name's hash value in the same way.  Alternates are
> + * thus generated by inverting the value of pairs of such
> + * "overlapping" bits in the original name.  Each byte after the
> + * first in a name adds at least one bit of overlap to work with.
> + * (See comments above flip_bit() for more discussion on this.)
> + *
> + * So the number of alternates is dependent on the number of such
> + * overlapping bits in a name.  If there are N bit overlaps, there
> + * 2^N alternates for that hash value.
> + *
> + * Here are the number of overlapping bits available for generating
> + * alternates for names of specific lengths:
> + *	1	0	(must have 2 bytes to have any overlap)
> + *	2	1	One bit overlaps--so 2 possible alternates
> + *	3	2	Two bits overlap--so 4 possible alternates
> + *	4	4	Three bits overlap, so 2^3 alternates
> + *	5	8	8 bits overlap (due to wrapping), 256 alternates
> + *	6	18	2^18 alternates
> + *	7	28	2^28 alternates
> + *	   ...
> + * It's clear that the number of alternates grows very quickly with
> + * the length of the name.  But note that the set of alternates
> + * includes invalid names.  And for certain (contrived) names, the
> + * number of valid names is a fairly small fraction of the total
> + * number of alternates.
> + *
> + * The main driver for this infrastructure for coming up with
> + * alternate names is really related to names 5 (or possibly 6)
> + * bytes in length.  5-byte obfuscated names contain no randomly-
> + * generated bytes in them, and the chance of an obfuscated name
> + * matching an already-seen name is too high to just ignore.  This
> + * methodical selection of alternates ensures we don't produce
> + * duplicate names unless we have exhausted our options.
> + */
> +int
> +find_alternate(
> +	size_t		name_len,
> +	unsigned char	*name,
> +	uint32_t	seq)
> +{
> +	uint32_t	bitseq = 0;
> +	uint32_t	bits = seq;
> +
> +	if (!seq)
> +		return 1;	/* alternate 0 is the original name */
> +	if (name_len < 2)	/* Must have 2 bytes to flip */
> +		return -1;
> +
> +	for (bitseq = 0; bits; bitseq++) {
> +		uint32_t	mask = 1 << bitseq;
> +		int		fb;
> +
> +		if (!(bits & mask))
> +			continue;
> +
> +		fb = flip_bit(name_len, name, bitseq);
> +		if (fb < 1)
> +			return fb ? -1 : 0;
> +		bits ^= mask;
> +	}
> +
> +	return 1;
> +}
> diff --git a/db/obfuscate.h b/db/obfuscate.h
> new file mode 100644
> index 00000000000..afaaca37154
> --- /dev/null
> +++ b/db/obfuscate.h
> @@ -0,0 +1,17 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2007, 2011 SGI
> + * All Rights Reserved.
> + */
> +#ifndef __DB_OBFUSCATE_H__
> +#define __DB_OBFUSCATE_H__
> +
> +/* Routines to obfuscate directory filenames and xattr names. */
> +
> +#define is_invalid_char(c)	((c) == '/' || (c) == '\0')
> +
> +void obfuscate_name(xfs_dahash_t hash, size_t name_len, unsigned char *name,
> +		bool is_dirent);
> +int find_alternate(size_t name_len, unsigned char *name, uint32_t seq);
> +
> +#endif /* __DB_OBFUSCATE_H__ */
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 2/3] xfs_db: create dirents and xattrs with colliding names
  2023-06-05 15:37 ` [PATCH 2/3] xfs_db: create dirents and xattrs with colliding names Darrick J. Wong
@ 2023-06-06 11:33   ` Carlos Maiolino
  2023-06-14 11:32   ` Carlos Maiolino
  1 sibling, 0 replies; 11+ messages in thread
From: Carlos Maiolino @ 2023-06-06 11:33 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Mon, Jun 05, 2023 at 08:37:09AM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
> 
> Create a new debugger command that will create dirent and xattr names
> that induce dahash collisions.
> 
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> ---
>  db/hash.c         |  376 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>  man/man8/xfs_db.8 |   31 ++++
>  2 files changed, 407 insertions(+)

Looks good, will test.

Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>

> 
> 
> diff --git a/db/hash.c b/db/hash.c
> index 68c53e7f9bc..79a250526e9 100644
> --- a/db/hash.c
> +++ b/db/hash.c
> @@ -5,12 +5,15 @@
>   */
> 
>  #include "libxfs.h"
> +#include "init.h"
>  #include "addr.h"
>  #include "command.h"
>  #include "type.h"
>  #include "io.h"
>  #include "output.h"
>  #include "hash.h"
> +#include "obfuscate.h"
> +#include <sys/xattr.h>
> 
>  static int hash_f(int argc, char **argv);
>  static void hash_help(void);
> @@ -46,8 +49,381 @@ hash_f(
>  	return 0;
>  }
> 
> +static void
> +hashcoll_help(void)
> +{
> +	printf(_(
> +"\n"
> +" Generate obfuscated variants of the provided name.  Each variant will have\n"
> +" the same dahash value.  Names are written to stdout with a NULL separating\n"
> +" each name.\n"
> +"\n"
> +" -a -- create extended attributes.\n"
> +" -i -- read standard input for the name, up to %d bytes.\n"
> +" -n -- create this many names.\n"
> +" -p -- create directory entries or extended attributes in this file.\n"
> +" -s -- seed the rng with this value.\n"
> +"\n"),
> +			MAXNAMELEN - 1);
> +}
> +
> +struct name_dup {
> +	struct name_dup	*next;
> +	uint32_t	crc;
> +	uint8_t		namelen;
> +	uint8_t		name[];
> +};
> +
> +static inline size_t
> +name_dup_sizeof(
> +	unsigned int	namelen)
> +{
> +	return sizeof(struct name_dup) + namelen;
> +}
> +
> +#define MAX_DUP_TABLE_BUCKETS	(1048575)
> +
> +struct dup_table {
> +	unsigned int	nr_buckets;
> +	struct name_dup	*buckets[];
> +};
> +
> +static inline size_t
> +dup_table_sizeof(
> +	unsigned int	nr_buckets)
> +{
> +	return sizeof(struct dup_table) +
> +				(nr_buckets * sizeof(struct name_dup *));
> +}
> +
> +static int
> +dup_table_alloc(
> +	unsigned long		nr_names,
> +	struct dup_table	**tabp)
> +{
> +	struct dup_table	*t;
> +
> +	*tabp = NULL;
> +
> +	if (nr_names == 1)
> +		return 0;
> +
> +	nr_names = min(MAX_DUP_TABLE_BUCKETS, nr_names);
> +	t = calloc(1, dup_table_sizeof(nr_names));
> +	if (!t)
> +		return ENOMEM;
> +
> +	t->nr_buckets = nr_names;
> +	*tabp = t;
> +	return 0;
> +}
> +
> +static void
> +dup_table_free(
> +	struct dup_table	*tab)
> +{
> +	struct name_dup		*ent, *next;
> +	unsigned int		i;
> +
> +	if (!tab)
> +		return;
> +
> +	for (i = 0; i < tab->nr_buckets; i++) {
> +		ent = tab->buckets[i];
> +
> +		while (ent) {
> +			next = ent->next;
> +			free(ent);
> +			ent = next;
> +		}
> +	}
> +	free(tab);
> +}
> +
> +static struct name_dup *
> +dup_table_find(
> +	struct dup_table	*tab,
> +	unsigned char		*name,
> +	size_t			namelen)
> +{
> +	struct name_dup		*ent;
> +	uint32_t		crc = crc32c(~0, name, namelen);
> +
> +	ent = tab->buckets[crc % tab->nr_buckets];
> +	while (ent) {
> +		if (ent->crc == crc &&
> +		    ent->namelen == namelen &&
> +		    !memcmp(ent->name, name, namelen))
> +			return ent;
> +
> +		ent = ent->next;
> +	}
> +
> +	return NULL;
> +}
> +
> +static int
> +dup_table_store(
> +	struct dup_table	*tab,
> +	unsigned char		*name,
> +	size_t			namelen)
> +{
> +	struct name_dup		*dup;
> +	uint32_t		seq = 1;
> +
> +	ASSERT(namelen < MAXNAMELEN);
> +
> +	while ((dup = dup_table_find(tab, name, namelen)) != NULL) {
> +		int		ret;
> +
> +		do {
> +			ret = find_alternate(namelen, name, seq++);
> +		} while (ret == 0);
> +		if (ret < 0)
> +			return EEXIST;
> +	}
> +
> +	dup = malloc(name_dup_sizeof(namelen));
> +	if (!dup)
> +		return ENOMEM;
> +
> +	dup->crc = crc32c(~0, name, namelen);
> +	dup->namelen = namelen;
> +	memcpy(dup->name, name, namelen);
> +	dup->next = tab->buckets[dup->crc % tab->nr_buckets];
> +
> +	tab->buckets[dup->crc % tab->nr_buckets] = dup;
> +	return 0;
> +}
> +
> +static int
> +collide_dirents(
> +	unsigned long		nr,
> +	const unsigned char	*name,
> +	size_t			namelen,
> +	int			fd)
> +{
> +	struct xfs_name		dname = {
> +		.name		= name,
> +		.len		= namelen,
> +	};
> +	unsigned char		direntname[MAXNAMELEN + 1];
> +	struct dup_table	*tab = NULL;
> +	xfs_dahash_t		old_hash;
> +	unsigned long		i;
> +	int			error = 0;
> +
> +	old_hash = libxfs_dir2_hashname(mp, &dname);
> +
> +	if (fd >= 0) {
> +		int		newfd;
> +
> +		/*
> +		 * User passed in a fd, so we'll use the directory to detect
> +		 * duplicate names.  First create the name that we are passed
> +		 * in; the new names will be hardlinks to the first file.
> +		 */
> +		newfd = openat(fd, name, O_CREAT, 0600);
> +		if (newfd < 0)
> +			return errno;
> +		close(newfd);
> +	} else if (nr > 1) {
> +		/*
> +		 * Track every name we create so that we don't emit duplicates.
> +		 */
> +		error = dup_table_alloc(nr, &tab);
> +		if (error)
> +			return error;
> +	}
> +
> +	dname.name = direntname;
> +	for (i = 0; i < nr; i++) {
> +		strncpy(direntname, name, MAXNAMELEN);
> +		obfuscate_name(old_hash, namelen, direntname, true);
> +		ASSERT(old_hash == libxfs_dir2_hashname(mp, &dname));
> +
> +		if (fd >= 0) {
> +			error = linkat(fd, name, fd, direntname, 0);
> +			if (error && errno != EEXIST)
> +				return errno;
> +
> +			/* don't print names to stdout */
> +			continue;
> +		} else if (tab) {
> +			error = dup_table_store(tab, direntname, namelen);
> +			if (error)
> +				break;
> +		}
> +
> +		printf("%s%c", direntname, 0);
> +	}
> +
> +	dup_table_free(tab);
> +	return error;
> +}
> +
> +static int
> +collide_xattrs(
> +	unsigned long		nr,
> +	const unsigned char	*name,
> +	size_t			namelen,
> +	int			fd)
> +{
> +	unsigned char		xattrname[MAXNAMELEN + 5];
> +	struct dup_table	*tab = NULL;
> +	xfs_dahash_t		old_hash;
> +	unsigned long		i;
> +	int			error;
> +
> +	old_hash = libxfs_da_hashname(name, namelen);
> +
> +	if (fd >= 0) {
> +		/*
> +		 * User passed in a fd, so we'll use the xattr structure to
> +		 * detect duplicate names.  First create the attribute that we
> +		 * are passed in.
> +		 */
> +		snprintf(xattrname, MAXNAMELEN + 5, "user.%s", name);
> +		error = fsetxattr(fd, xattrname, "1", 1, 0);
> +		if (error)
> +			return errno;
> +	} else if (nr > 1) {
> +		/*
> +		 * Track every name we create so that we don't emit duplicates.
> +		 */
> +		error = dup_table_alloc(nr, &tab);
> +		if (error)
> +			return error;
> +	}
> +
> +	for (i = 0; i < nr; i++) {
> +		snprintf(xattrname, MAXNAMELEN + 5, "user.%s", name);
> +		obfuscate_name(old_hash, namelen, xattrname + 5, false);
> +		ASSERT(old_hash == libxfs_da_hashname(xattrname + 5, namelen));
> +
> +		if (fd >= 0) {
> +			error = fsetxattr(fd, xattrname, "1", 1, 0);
> +			if (error)
> +				return errno;
> +
> +			/* don't print names to stdout */
> +			continue;
> +		} else if (tab) {
> +			error = dup_table_store(tab, xattrname, namelen + 5);
> +			if (error)
> +				break;
> +		}
> +
> +		printf("%s%c", xattrname, 0);
> +	}
> +
> +	dup_table_free(tab);
> +	return error;
> +}
> +
> +static int
> +hashcoll_f(
> +	int		argc,
> +	char		**argv)
> +{
> +	const char	*path = NULL;
> +	bool		read_stdin = false;
> +	bool		create_xattr = false;
> +	unsigned long	nr = 1, seed = 0;
> +	int		fd = -1;
> +	int		c;
> +	int		error;
> +
> +	while ((c = getopt(argc, argv, "ain:p:s:")) != EOF) {
> +		switch (c) {
> +		case 'a':
> +			create_xattr = true;
> +			break;
> +		case 'i':
> +			read_stdin = true;
> +			break;
> +		case 'n':
> +			nr = strtoul(optarg, NULL, 10);
> +			break;
> +		case 'p':
> +			path = optarg;
> +			break;
> +		case 's':
> +			seed = strtoul(optarg, NULL, 10);
> +			break;
> +		default:
> +			exitcode = 1;
> +			hashcoll_help();
> +			return 0;
> +		}
> +	}
> +
> +	if (path) {
> +		int	oflags = O_RDWR;
> +
> +		if (!create_xattr)
> +			oflags = O_RDONLY | O_DIRECTORY;
> +
> +		fd = open(path, oflags);
> +		if (fd < 0) {
> +			perror(path);
> +			exitcode = 1;
> +			return 0;
> +		}
> +	}
> +
> +	if (seed)
> +		srandom(seed);
> +
> +	if (read_stdin) {
> +		char	buf[MAXNAMELEN];
> +		size_t	len;
> +
> +		len = fread(buf, 1, MAXNAMELEN - 1, stdin);
> +
> +		if (create_xattr)
> +			error = collide_xattrs(nr, buf, len, fd);
> +		else
> +			error = collide_dirents(nr, buf, len, fd);
> +		if (error) {
> +			printf(_("hashcoll: %s\n"), strerror(error));
> +			exitcode = 1;
> +		}
> +		goto done;
> +	}
> +
> +	for (c = optind; c < argc; c++) {
> +		size_t	len = strlen(argv[c]);
> +
> +		if (create_xattr)
> +			error = collide_xattrs(nr, argv[c], len, fd);
> +		else
> +			error = collide_dirents(nr, argv[c], len, fd);
> +		if (error) {
> +			printf(_("hashcoll: %s\n"), strerror(error));
> +			exitcode = 1;
> +		}
> +	}
> +
> +done:
> +	if (fd >= 0)
> +		close(fd);
> +	return 0;
> +}
> +
> +static cmdinfo_t	hashcoll_cmd = {
> +	.name		= "hashcoll",
> +	.cfunc		= hashcoll_f,
> +	.argmin		= 0,
> +	.argmax		= -1,
> +	.args		= N_("[-a] [-s seed] [-n nr] [-p path] -i|names..."),
> +	.oneline	= N_("create names that produce dahash collisions"),
> +	.help		= hashcoll_help,
> +};
> +
>  void
>  hash_init(void)
>  {
>  	add_command(&hash_cmd);
> +	add_command(&hashcoll_cmd);
>  }
> diff --git a/man/man8/xfs_db.8 b/man/man8/xfs_db.8
> index 1a2bb7e98fb..fde1c5c6c69 100644
> --- a/man/man8/xfs_db.8
> +++ b/man/man8/xfs_db.8
> @@ -768,6 +768,37 @@ Prints the hash value of
>  .I string
>  using the hash function of the XFS directory and attribute implementation.
>  .TP
> +.BI "hashcoll [-a] [-s seed] [-n " nr "] [-p " path "] -i | " names...
> +Create directory entries or extended attributes names that all have the same
> +hash value.
> +The metadump name obfuscation algorithm is used here.
> +Names are written to standard output, with a NULL between each name for use
> +with xargs -0.
> +.RS 1.0i
> +.PD 0
> +.TP 0.4i
> +.TP 0.4i
> +.B \-a
> +Create extended attribute names.
> +.TP 0.4i
> +.B \-i
> +Read the first name to create from standard input.
> +Up to 255 bytes are read.
> +If this option is not specified, first names are taken from the command line.
> +.TP 0.4i
> +.BI \-n " nr"
> +Create this many duplicated names.
> +The default is to create one name.
> +.TP 0.4i
> +.BI \-p " path"
> +Create directory entries or extended attributes in this file instead of
> +writing the names to standard output.
> +.TP 0.4i
> +.BI \-s " seed"
> +Seed the random number generator with this value.
> +.PD
> +.RE
> +.TP
>  .BI "help [" command ]
>  Print help for one or all commands.
>  .TP
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 3/3] xfs_db: make the hash command print the dirent hash
  2023-06-05 15:37 ` [PATCH 3/3] xfs_db: make the hash command print the dirent hash Darrick J. Wong
@ 2023-06-06 11:36   ` Carlos Maiolino
  0 siblings, 0 replies; 11+ messages in thread
From: Carlos Maiolino @ 2023-06-06 11:36 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Mon, Jun 05, 2023 at 08:37:15AM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
> 
> It turns out that the da and dir2 hashname functions are /not/ the same,
> at least not on ascii-ci filesystems.  Enhance this debugger command to
> support printing the dir2 hashname.
> 
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>

Looks good, will test.

Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>

> ---
>  db/hash.c         |   42 +++++++++++++++++++++++++++++++++++++-----
>  man/man8/xfs_db.8 |    8 +++++++-
>  2 files changed, 44 insertions(+), 6 deletions(-)
> 
> 
> diff --git a/db/hash.c b/db/hash.c
> index 79a250526e9..716da88baf9 100644
> --- a/db/hash.c
> +++ b/db/hash.c
> @@ -18,9 +18,15 @@
>  static int hash_f(int argc, char **argv);
>  static void hash_help(void);
> 
> -static const cmdinfo_t hash_cmd =
> -	{ "hash", NULL, hash_f, 1, 1, 0, N_("string"),
> -	  N_("calculate hash value"), hash_help };
> +static const cmdinfo_t hash_cmd = {
> +	.name		= "hash",
> +	.cfunc		= hash_f,
> +	.argmin		= 1,
> +	.argmax		= -1,
> +	.args		= N_("string"),
> +	.oneline	= N_("calculate hash value"),
> +	.help		= hash_help,
> +};
> 
>  static void
>  hash_help(void)
> @@ -43,9 +49,35 @@ hash_f(
>  	char		**argv)
>  {
>  	xfs_dahash_t	hashval;
> +	bool		use_dir2_hash = false;
> +	int		c;
> +
> +	while ((c = getopt(argc, argv, "d")) != EOF) {
> +		switch (c) {
> +		case 'd':
> +			use_dir2_hash = true;
> +			break;
> +		default:
> +			exitcode = 1;
> +			hash_help();
> +			return 0;
> +		}
> +	}
> +
> +	for (c = optind; c < argc; c++) {
> +		if (use_dir2_hash) {
> +			struct xfs_name	xname = {
> +				.name	= (uint8_t *)argv[c],
> +				.len	= strlen(argv[c]),
> +			};
> +
> +			hashval = libxfs_dir2_hashname(mp, &xname);
> +		} else {
> +			hashval = libxfs_da_hashname(argv[c], strlen(argv[c]));
> +		}
> +		dbprintf("0x%x\n", hashval);
> +	}
> 
> -	hashval = libxfs_da_hashname((unsigned char *)argv[1], (int)strlen(argv[1]));
> -	dbprintf("0x%x\n", hashval);
>  	return 0;
>  }
> 
> diff --git a/man/man8/xfs_db.8 b/man/man8/xfs_db.8
> index fde1c5c6c69..60dcdc52cba 100644
> --- a/man/man8/xfs_db.8
> +++ b/man/man8/xfs_db.8
> @@ -763,10 +763,16 @@ Skip write verifiers but perform CRC recalculation; allows invalid data to be
>  written to disk to test detection of invalid data.
>  .RE
>  .TP
> -.BI hash " string
> +.BI hash [-d]" strings
>  Prints the hash value of
>  .I string
>  using the hash function of the XFS directory and attribute implementation.
> +
> +If the
> +.B \-d
> +option is specified, the directory-specific hash function is used.
> +This only makes a difference on filesystems with ascii case-insensitive
> +lookups enabled.
>  .TP
>  .BI "hashcoll [-a] [-s seed] [-n " nr "] [-p " path "] -i | " names...
>  Create directory entries or extended attributes names that all have the same
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 2/3] xfs_db: create dirents and xattrs with colliding names
  2023-06-05 15:37 ` [PATCH 2/3] xfs_db: create dirents and xattrs with colliding names Darrick J. Wong
  2023-06-06 11:33   ` Carlos Maiolino
@ 2023-06-14 11:32   ` Carlos Maiolino
  1 sibling, 0 replies; 11+ messages in thread
From: Carlos Maiolino @ 2023-06-14 11:32 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

Hi Darrick.

On Mon, Jun 05, 2023 at 08:37:09AM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
> 
> Create a new debugger command that will create dirent and xattr names
> that induce dahash collisions.
> 
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> ---
>  db/hash.c         |  376 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>  man/man8/xfs_db.8 |   31 ++++
>  2 files changed, 407 insertions(+)

This patch is trigerring a bunch of warnings due differing type on db/hash.c, like:


=====
hash.c: In function ‘collide_dirents’:
hash.c:226:22: warning: pointer targets in passing argument 2 of ‘openat’ differ in signedness [-Wpointer-sign]
  226 |   newfd = openat(fd, name, O_CREAT, 0600);
      |                      ^~~~
      |                      |
      |                      const unsigned char *
In file included from /usr/include/features.h:461,
                 from /usr/include/x86_64-linux-gnu/bits/libc-header-start.h:33,
                 from /usr/include/stdio.h:27,
                 from ../include/platform_defs.h:10,
                 from ../include/libxfs.h:11,
                 from hash.c:7:
/usr/include/fcntl.h:196:12: note: expected ‘const char *’ but argument is of type ‘const unsigned char *’
  196 | extern int __REDIRECT (openat, (int __fd, const char *__file, int __oflag,
      |            ^~~~~~~~~~
hash.c:241:11: warning: pointer targets in passing argument 1 of ‘strncpy’ differ in signedness [-Wpointer-sign]
  241 |   strncpy(direntname, name, MAXNAMELEN);
      |           ^~~~~~~~~~
      |           |
      |           unsigned char *
In file included from ../include/platform_defs.h:17,
                 from ../include/libxfs.h:11,
                 from hash.c:7:

=====

Could you please take a look onto it?

Thanks!

-- 
Carlos

> 
> 
> diff --git a/db/hash.c b/db/hash.c
> index 68c53e7f9bc..79a250526e9 100644
> --- a/db/hash.c
> +++ b/db/hash.c
> @@ -5,12 +5,15 @@
>   */
> 
>  #include "libxfs.h"
> +#include "init.h"
>  #include "addr.h"
>  #include "command.h"
>  #include "type.h"
>  #include "io.h"
>  #include "output.h"
>  #include "hash.h"
> +#include "obfuscate.h"
> +#include <sys/xattr.h>
> 
>  static int hash_f(int argc, char **argv);
>  static void hash_help(void);
> @@ -46,8 +49,381 @@ hash_f(
>  	return 0;
>  }
> 
> +static void
> +hashcoll_help(void)
> +{
> +	printf(_(
> +"\n"
> +" Generate obfuscated variants of the provided name.  Each variant will have\n"
> +" the same dahash value.  Names are written to stdout with a NULL separating\n"
> +" each name.\n"
> +"\n"
> +" -a -- create extended attributes.\n"
> +" -i -- read standard input for the name, up to %d bytes.\n"
> +" -n -- create this many names.\n"
> +" -p -- create directory entries or extended attributes in this file.\n"
> +" -s -- seed the rng with this value.\n"
> +"\n"),
> +			MAXNAMELEN - 1);
> +}
> +
> +struct name_dup {
> +	struct name_dup	*next;
> +	uint32_t	crc;
> +	uint8_t		namelen;
> +	uint8_t		name[];
> +};
> +
> +static inline size_t
> +name_dup_sizeof(
> +	unsigned int	namelen)
> +{
> +	return sizeof(struct name_dup) + namelen;
> +}
> +
> +#define MAX_DUP_TABLE_BUCKETS	(1048575)
> +
> +struct dup_table {
> +	unsigned int	nr_buckets;
> +	struct name_dup	*buckets[];
> +};
> +
> +static inline size_t
> +dup_table_sizeof(
> +	unsigned int	nr_buckets)
> +{
> +	return sizeof(struct dup_table) +
> +				(nr_buckets * sizeof(struct name_dup *));
> +}
> +
> +static int
> +dup_table_alloc(
> +	unsigned long		nr_names,
> +	struct dup_table	**tabp)
> +{
> +	struct dup_table	*t;
> +
> +	*tabp = NULL;
> +
> +	if (nr_names == 1)
> +		return 0;
> +
> +	nr_names = min(MAX_DUP_TABLE_BUCKETS, nr_names);
> +	t = calloc(1, dup_table_sizeof(nr_names));
> +	if (!t)
> +		return ENOMEM;
> +
> +	t->nr_buckets = nr_names;
> +	*tabp = t;
> +	return 0;
> +}
> +
> +static void
> +dup_table_free(
> +	struct dup_table	*tab)
> +{
> +	struct name_dup		*ent, *next;
> +	unsigned int		i;
> +
> +	if (!tab)
> +		return;
> +
> +	for (i = 0; i < tab->nr_buckets; i++) {
> +		ent = tab->buckets[i];
> +
> +		while (ent) {
> +			next = ent->next;
> +			free(ent);
> +			ent = next;
> +		}
> +	}
> +	free(tab);
> +}
> +
> +static struct name_dup *
> +dup_table_find(
> +	struct dup_table	*tab,
> +	unsigned char		*name,
> +	size_t			namelen)
> +{
> +	struct name_dup		*ent;
> +	uint32_t		crc = crc32c(~0, name, namelen);
> +
> +	ent = tab->buckets[crc % tab->nr_buckets];
> +	while (ent) {
> +		if (ent->crc == crc &&
> +		    ent->namelen == namelen &&
> +		    !memcmp(ent->name, name, namelen))
> +			return ent;
> +
> +		ent = ent->next;
> +	}
> +
> +	return NULL;
> +}
> +
> +static int
> +dup_table_store(
> +	struct dup_table	*tab,
> +	unsigned char		*name,
> +	size_t			namelen)
> +{
> +	struct name_dup		*dup;
> +	uint32_t		seq = 1;
> +
> +	ASSERT(namelen < MAXNAMELEN);
> +
> +	while ((dup = dup_table_find(tab, name, namelen)) != NULL) {
> +		int		ret;
> +
> +		do {
> +			ret = find_alternate(namelen, name, seq++);
> +		} while (ret == 0);
> +		if (ret < 0)
> +			return EEXIST;
> +	}
> +
> +	dup = malloc(name_dup_sizeof(namelen));
> +	if (!dup)
> +		return ENOMEM;
> +
> +	dup->crc = crc32c(~0, name, namelen);
> +	dup->namelen = namelen;
> +	memcpy(dup->name, name, namelen);
> +	dup->next = tab->buckets[dup->crc % tab->nr_buckets];
> +
> +	tab->buckets[dup->crc % tab->nr_buckets] = dup;
> +	return 0;
> +}
> +
> +static int
> +collide_dirents(
> +	unsigned long		nr,
> +	const unsigned char	*name,
> +	size_t			namelen,
> +	int			fd)
> +{
> +	struct xfs_name		dname = {
> +		.name		= name,
> +		.len		= namelen,
> +	};
> +	unsigned char		direntname[MAXNAMELEN + 1];
> +	struct dup_table	*tab = NULL;
> +	xfs_dahash_t		old_hash;
> +	unsigned long		i;
> +	int			error = 0;
> +
> +	old_hash = libxfs_dir2_hashname(mp, &dname);
> +
> +	if (fd >= 0) {
> +		int		newfd;
> +
> +		/*
> +		 * User passed in a fd, so we'll use the directory to detect
> +		 * duplicate names.  First create the name that we are passed
> +		 * in; the new names will be hardlinks to the first file.
> +		 */
> +		newfd = openat(fd, name, O_CREAT, 0600);
> +		if (newfd < 0)
> +			return errno;
> +		close(newfd);
> +	} else if (nr > 1) {
> +		/*
> +		 * Track every name we create so that we don't emit duplicates.
> +		 */
> +		error = dup_table_alloc(nr, &tab);
> +		if (error)
> +			return error;
> +	}
> +
> +	dname.name = direntname;
> +	for (i = 0; i < nr; i++) {
> +		strncpy(direntname, name, MAXNAMELEN);
> +		obfuscate_name(old_hash, namelen, direntname, true);
> +		ASSERT(old_hash == libxfs_dir2_hashname(mp, &dname));
> +
> +		if (fd >= 0) {
> +			error = linkat(fd, name, fd, direntname, 0);
> +			if (error && errno != EEXIST)
> +				return errno;
> +
> +			/* don't print names to stdout */
> +			continue;
> +		} else if (tab) {
> +			error = dup_table_store(tab, direntname, namelen);
> +			if (error)
> +				break;
> +		}
> +
> +		printf("%s%c", direntname, 0);
> +	}
> +
> +	dup_table_free(tab);
> +	return error;
> +}
> +
> +static int
> +collide_xattrs(
> +	unsigned long		nr,
> +	const unsigned char	*name,
> +	size_t			namelen,
> +	int			fd)
> +{
> +	unsigned char		xattrname[MAXNAMELEN + 5];
> +	struct dup_table	*tab = NULL;
> +	xfs_dahash_t		old_hash;
> +	unsigned long		i;
> +	int			error;
> +
> +	old_hash = libxfs_da_hashname(name, namelen);
> +
> +	if (fd >= 0) {
> +		/*
> +		 * User passed in a fd, so we'll use the xattr structure to
> +		 * detect duplicate names.  First create the attribute that we
> +		 * are passed in.
> +		 */
> +		snprintf(xattrname, MAXNAMELEN + 5, "user.%s", name);
> +		error = fsetxattr(fd, xattrname, "1", 1, 0);
> +		if (error)
> +			return errno;
> +	} else if (nr > 1) {
> +		/*
> +		 * Track every name we create so that we don't emit duplicates.
> +		 */
> +		error = dup_table_alloc(nr, &tab);
> +		if (error)
> +			return error;
> +	}
> +
> +	for (i = 0; i < nr; i++) {
> +		snprintf(xattrname, MAXNAMELEN + 5, "user.%s", name);
> +		obfuscate_name(old_hash, namelen, xattrname + 5, false);
> +		ASSERT(old_hash == libxfs_da_hashname(xattrname + 5, namelen));
> +
> +		if (fd >= 0) {
> +			error = fsetxattr(fd, xattrname, "1", 1, 0);
> +			if (error)
> +				return errno;
> +
> +			/* don't print names to stdout */
> +			continue;
> +		} else if (tab) {
> +			error = dup_table_store(tab, xattrname, namelen + 5);
> +			if (error)
> +				break;
> +		}
> +
> +		printf("%s%c", xattrname, 0);
> +	}
> +
> +	dup_table_free(tab);
> +	return error;
> +}
> +
> +static int
> +hashcoll_f(
> +	int		argc,
> +	char		**argv)
> +{
> +	const char	*path = NULL;
> +	bool		read_stdin = false;
> +	bool		create_xattr = false;
> +	unsigned long	nr = 1, seed = 0;
> +	int		fd = -1;
> +	int		c;
> +	int		error;
> +
> +	while ((c = getopt(argc, argv, "ain:p:s:")) != EOF) {
> +		switch (c) {
> +		case 'a':
> +			create_xattr = true;
> +			break;
> +		case 'i':
> +			read_stdin = true;
> +			break;
> +		case 'n':
> +			nr = strtoul(optarg, NULL, 10);
> +			break;
> +		case 'p':
> +			path = optarg;
> +			break;
> +		case 's':
> +			seed = strtoul(optarg, NULL, 10);
> +			break;
> +		default:
> +			exitcode = 1;
> +			hashcoll_help();
> +			return 0;
> +		}
> +	}
> +
> +	if (path) {
> +		int	oflags = O_RDWR;
> +
> +		if (!create_xattr)
> +			oflags = O_RDONLY | O_DIRECTORY;
> +
> +		fd = open(path, oflags);
> +		if (fd < 0) {
> +			perror(path);
> +			exitcode = 1;
> +			return 0;
> +		}
> +	}
> +
> +	if (seed)
> +		srandom(seed);
> +
> +	if (read_stdin) {
> +		char	buf[MAXNAMELEN];
> +		size_t	len;
> +
> +		len = fread(buf, 1, MAXNAMELEN - 1, stdin);
> +
> +		if (create_xattr)
> +			error = collide_xattrs(nr, buf, len, fd);
> +		else
> +			error = collide_dirents(nr, buf, len, fd);
> +		if (error) {
> +			printf(_("hashcoll: %s\n"), strerror(error));
> +			exitcode = 1;
> +		}
> +		goto done;
> +	}
> +
> +	for (c = optind; c < argc; c++) {
> +		size_t	len = strlen(argv[c]);
> +
> +		if (create_xattr)
> +			error = collide_xattrs(nr, argv[c], len, fd);
> +		else
> +			error = collide_dirents(nr, argv[c], len, fd);
> +		if (error) {
> +			printf(_("hashcoll: %s\n"), strerror(error));
> +			exitcode = 1;
> +		}
> +	}
> +
> +done:
> +	if (fd >= 0)
> +		close(fd);
> +	return 0;
> +}
> +
> +static cmdinfo_t	hashcoll_cmd = {
> +	.name		= "hashcoll",
> +	.cfunc		= hashcoll_f,
> +	.argmin		= 0,
> +	.argmax		= -1,
> +	.args		= N_("[-a] [-s seed] [-n nr] [-p path] -i|names..."),
> +	.oneline	= N_("create names that produce dahash collisions"),
> +	.help		= hashcoll_help,
> +};
> +
>  void
>  hash_init(void)
>  {
>  	add_command(&hash_cmd);
> +	add_command(&hashcoll_cmd);
>  }
> diff --git a/man/man8/xfs_db.8 b/man/man8/xfs_db.8
> index 1a2bb7e98fb..fde1c5c6c69 100644
> --- a/man/man8/xfs_db.8
> +++ b/man/man8/xfs_db.8
> @@ -768,6 +768,37 @@ Prints the hash value of
>  .I string
>  using the hash function of the XFS directory and attribute implementation.
>  .TP
> +.BI "hashcoll [-a] [-s seed] [-n " nr "] [-p " path "] -i | " names...
> +Create directory entries or extended attributes names that all have the same
> +hash value.
> +The metadump name obfuscation algorithm is used here.
> +Names are written to standard output, with a NULL between each name for use
> +with xargs -0.
> +.RS 1.0i
> +.PD 0
> +.TP 0.4i
> +.TP 0.4i
> +.B \-a
> +Create extended attribute names.
> +.TP 0.4i
> +.B \-i
> +Read the first name to create from standard input.
> +Up to 255 bytes are read.
> +If this option is not specified, first names are taken from the command line.
> +.TP 0.4i
> +.BI \-n " nr"
> +Create this many duplicated names.
> +The default is to create one name.
> +.TP 0.4i
> +.BI \-p " path"
> +Create directory entries or extended attributes in this file instead of
> +writing the names to standard output.
> +.TP 0.4i
> +.BI \-s " seed"
> +Seed the random number generator with this value.
> +.PD
> +.RE
> +.TP
>  .BI "help [" command ]
>  Print help for one or all commands.
>  .TP
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH v2 1/3] xfs_db: hoist name obfuscation code out of metadump.c
  2023-06-05 15:37 ` [PATCH 1/3] xfs_db: hoist name obfuscation code out of metadump.c Darrick J. Wong
  2023-06-06 11:17   ` Carlos Maiolino
@ 2023-06-15 16:12   ` Darrick J. Wong
  2023-06-22 11:55     ` Carlos Maiolino
  1 sibling, 1 reply; 11+ messages in thread
From: Darrick J. Wong @ 2023-06-15 16:12 UTC (permalink / raw)
  To: cem; +Cc: linux-xfs

From: Darrick J. Wong <djwong@kernel.org>

We want to create a debugger command that will create obfuscated names
for directory and xattr names, so hoist the name obfuscation code into a
separate file.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
---
v2: rebase to deal with s/alloca/malloc change
---
 db/Makefile    |    2 
 db/metadump.c  |  388 -------------------------------------------------------
 db/obfuscate.c |  393 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 db/obfuscate.h |   17 ++
 4 files changed, 412 insertions(+), 388 deletions(-)
 create mode 100644 db/obfuscate.c
 create mode 100644 db/obfuscate.h

diff --git a/db/Makefile b/db/Makefile
index b2e01174571..2f95f67075d 100644
--- a/db/Makefile
+++ b/db/Makefile
@@ -13,7 +13,7 @@ HFILES = addr.h agf.h agfl.h agi.h attr.h attrshort.h bit.h block.h bmap.h \
 	flist.h fprint.h frag.h freesp.h hash.h help.h init.h inode.h input.h \
 	io.h logformat.h malloc.h metadump.h output.h print.h quit.h sb.h \
 	sig.h strvec.h text.h type.h write.h attrset.h symlink.h fsmap.h \
-	fuzz.h
+	fuzz.h obfuscate.h
 CFILES = $(HFILES:.h=.c) btdump.c btheight.c convert.c info.c namei.c \
 	timelimit.c
 LSRCFILES = xfs_admin.sh xfs_ncheck.sh xfs_metadump.sh
diff --git a/db/metadump.c b/db/metadump.c
index 9ccee0b7ace..d9a616a9296 100644
--- a/db/metadump.c
+++ b/db/metadump.c
@@ -19,6 +19,7 @@
 #include "faddr.h"
 #include "field.h"
 #include "dir2.h"
+#include "obfuscate.h"
 
 #define DEFAULT_MAX_EXT_SIZE	XFS_MAX_BMBT_EXTLEN
 
@@ -736,19 +737,6 @@ nametable_add(xfs_dahash_t hash, int namelen, unsigned char *name)
 	return ent;
 }
 
-#define is_invalid_char(c)	((c) == '/' || (c) == '\0')
-#define rol32(x,y)		(((x) << (y)) | ((x) >> (32 - (y))))
-
-static inline unsigned char
-random_filename_char(void)
-{
-	static unsigned char filename_alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-						"abcdefghijklmnopqrstuvwxyz"
-						"0123456789-_";
-
-	return filename_alphabet[random() % (sizeof filename_alphabet - 1)];
-}
-
 #define	ORPHANAGE	"lost+found"
 #define	ORPHANAGE_LEN	(sizeof (ORPHANAGE) - 1)
 
@@ -808,380 +796,6 @@ in_lost_found(
 	return slen == namelen && !memcmp(name, s, namelen);
 }
 
-/*
- * Given a name and its hash value, massage the name in such a way
- * that the result is another name of equal length which shares the
- * same hash value.
- */
-static void
-obfuscate_name(
-	xfs_dahash_t	hash,
-	size_t		name_len,
-	unsigned char	*name,
-	bool		is_dirent)
-{
-	unsigned char	*oldname = NULL;
-	unsigned char	*newp;
-	int		i;
-	xfs_dahash_t	new_hash;
-	unsigned char	*first;
-	unsigned char	high_bit;
-	int		tries = 0;
-	bool		is_ci_name = is_dirent && xfs_has_asciici(mp);
-	int		shift;
-
-	/*
-	 * Our obfuscation algorithm requires at least 5-character
-	 * names, so don't bother if the name is too short.  We
-	 * work backward from a hash value to determine the last
-	 * five bytes in a name required to produce a new name
-	 * with the same hash.
-	 */
-	if (name_len < 5)
-		return;
-
-	if (is_ci_name) {
-		oldname = malloc(name_len);
-		if (!oldname)
-			return;
-		memcpy(oldname, name, name_len);
-	}
-
-again:
-	newp = name;
-	new_hash = 0;
-
-	/*
-	 * If we cannot generate a ci-compatible obfuscated name after 1000
-	 * tries, don't bother obfuscating the name.
-	 */
-	if (tries++ > 1000) {
-		memcpy(name, oldname, name_len);
-		goto out_free;
-	}
-
-	/*
-	 * The beginning of the obfuscated name can be pretty much
-	 * anything, so fill it in with random characters.
-	 * Accumulate its new hash value as we go.
-	 */
-	for (i = 0; i < name_len - 5; i++) {
-		*newp = random_filename_char();
-		if (is_ci_name)
-			new_hash = xfs_ascii_ci_xfrm(*newp) ^
-							rol32(new_hash, 7);
-		else
-			new_hash = *newp ^ rol32(new_hash, 7);
-		newp++;
-	}
-
-	/*
-	 * Compute which five bytes need to be used at the end of
-	 * the name so the hash of the obfuscated name is the same
-	 * as the hash of the original.  If any result in an invalid
-	 * character, flip a bit and arrange for a corresponding bit
-	 * in a neighboring byte to be flipped as well.  For the
-	 * last byte, the "neighbor" to change is the first byte
-	 * we're computing here.
-	 */
-	new_hash = rol32(new_hash, 3) ^ hash;
-
-	first = newp;
-	high_bit = 0;
-	for (shift = 28; shift >= 0; shift -= 7) {
-		*newp = (new_hash >> shift & 0x7f) ^ high_bit;
-		if (is_invalid_char(*newp)) {
-			*newp ^= 1;
-			high_bit = 0x80;
-		} else
-			high_bit = 0;
-
-		/*
-		 * If ascii-ci is enabled, uppercase characters are converted
-		 * to lowercase characters while computing the name hash.  If
-		 * any of the necessary correction bytes are uppercase, the
-		 * hash of the new name will not match.  Try again with a
-		 * different prefix.
-		 */
-		if (is_ci_name && xfs_ascii_ci_need_xfrm(*newp))
-			goto again;
-
-		ASSERT(!is_invalid_char(*newp));
-		newp++;
-	}
-
-	/*
-	 * If we flipped a bit on the last byte, we need to fix up
-	 * the matching bit in the first byte.  The result will
-	 * be a valid character, because we know that first byte
-	 * has 0's in its upper four bits (it was produced by a
-	 * 28-bit right-shift of a 32-bit unsigned value).
-	 */
-	if (high_bit) {
-		*first ^= 0x10;
-
-		if (is_ci_name && xfs_ascii_ci_need_xfrm(*first))
-			goto again;
-
-		ASSERT(!is_invalid_char(*first));
-	}
-
-out_free:
-	free(oldname);
-}
-
-/*
- * Flip a bit in each of two bytes at the end of the given name.
- * This is used in generating a series of alternate names to be used
- * in the event a duplicate is found.
- *
- * The bits flipped are selected such that they both affect the same
- * bit in the name's computed hash value, so flipping them both will
- * preserve the hash.
- *
- * The following diagram aims to show the portion of a computed
- * hash that a given byte of a name affects.
- *
- *	   31    28      24    21	     14		  8 7       3     0
- *	   +-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-+
- * hash:   | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
- *	   +-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-+
- *	  last-4 ->|	       |<-- last-2 --->|	   |<--- last ---->|
- *		 |<-- last-3 --->|	     |<-- last-1 --->|     |<- last-4
- *			 |<-- last-7 --->|	     |<-- last-5 --->|
- *	   |<-- last-8 --->|	       |<-- last-6 --->|
- *			. . . and so on
- *
- * The last byte of the name directly affects the low-order byte of
- * the hash.  The next-to-last affects bits 7-14, the next one back
- * affects bits 14-21, and so on.  The effect wraps around when it
- * goes beyond the top of the hash (as happens for byte last-4).
- *
- * Bits that are flipped together "overlap" on the hash value.  As
- * an example of overlap, the last two bytes both affect bit 7 in
- * the hash.  That pair of bytes (and their overlapping bits) can be
- * used for this "flip bit" operation (it's the first pair tried,
- * actually).
- *
- * A table defines overlapping pairs--the bytes involved and bits
- * within them--that can be used this way.  The byte offset is
- * relative to a starting point within the name, which will be set
- * to affect the bytes at the end of the name.  The function is
- * called with a "bitseq" value which indicates which bit flip is
- * desired, and this translates directly into selecting which entry
- * in the bit_to_flip[] table to apply.
- *
- * The function returns 1 if the operation was successful.  It
- * returns 0 if the result produced a character that's not valid in
- * a name (either '/' or a '\0').  Finally, it returns -1 if the bit
- * sequence number is beyond what is supported for a name of this
- * length.
- *
- * Discussion
- * ----------
- * (Also see the discussion above find_alternate(), below.)
- *
- * In order to make this function work for any length name, the
- * table is ordered by increasing byte offset, so that the earliest
- * entries can apply to the shortest strings.  This way all names
- * are done consistently.
- *
- * When bit flips occur, they can convert printable characters
- * into non-printable ones.  In an effort to reduce the impact of
- * this, the first bit flips are chosen to affect bytes the end of
- * the name (and furthermore, toward the low bits of a byte).  Those
- * bytes are often non-printable anyway because of the way they are
- * initially selected by obfuscate_name()).  This is accomplished,
- * using later table entries first.
- *
- * Each row in the table doubles the number of alternates that
- * can be generated.  A two-byte name is limited to using only
- * the first row, so it's possible to generate two alternates
- * (the original name, plus the alternate produced by flipping
- * the one pair of bits).  In a 5-byte name, the effect of the
- * first byte overlaps the last by 4 its, and there are 8 bits
- * to flip, allowing for 256 possible alternates.
- *
- * Short names (less than 5 bytes) are never even obfuscated, so for
- * such names the relatively small number of alternates should never
- * really be a problem.
- *
- * Long names (more than 6 bytes, say) are not likely to exhaust
- * the number of available alternates.  In fact, the table could
- * probably have stopped at 8 entries, on the assumption that 256
- * alternates should be enough for most any situation.  The entries
- * beyond those are present mostly for demonstration of how it could
- * be populated with more entries, should it ever be necessary to do
- * so.
- */
-static int
-flip_bit(
-	size_t		name_len,
-	unsigned char	*name,
-	uint32_t	bitseq)
-{
-	int	index;
-	size_t	offset;
-	unsigned char *p0, *p1;
-	unsigned char m0, m1;
-	struct {
-	    int		byte;	/* Offset from start within name */
-	    unsigned char bit;	/* Bit within that byte */
-	} bit_to_flip[][2] = {	/* Sorted by second entry's byte */
-	    { { 0, 0 }, { 1, 7 } },	/* Each row defines a pair */
-	    { { 1, 0 }, { 2, 7 } },	/* of bytes and a bit within */
-	    { { 2, 0 }, { 3, 7 } },	/* each byte.  Each bit in */
-	    { { 0, 4 }, { 4, 0 } },	/* a pair affects the same */
-	    { { 0, 5 }, { 4, 1 } },	/* bit in the hash, so flipping */
-	    { { 0, 6 }, { 4, 2 } },	/* both will change the name */
-	    { { 0, 7 }, { 4, 3 } },	/* while preserving the hash. */
-	    { { 3, 0 }, { 4, 7 } },
-	    { { 0, 0 }, { 5, 3 } },	/* The first entry's byte offset */
-	    { { 0, 1 }, { 5, 4 } },	/* must be less than the second. */
-	    { { 0, 2 }, { 5, 5 } },
-	    { { 0, 3 }, { 5, 6 } },	/* The table can be extended to */
-	    { { 0, 4 }, { 5, 7 } },	/* an arbitrary number of entries */
-	    { { 4, 0 }, { 5, 7 } },	/* but there's not much point. */
-		/* . . . */
-	};
-
-	/* Find the first entry *not* usable for name of this length */
-
-	for (index = 0; index < ARRAY_SIZE(bit_to_flip); index++)
-		if (bit_to_flip[index][1].byte >= name_len)
-			break;
-
-	/*
-	 * Back up to the last usable entry.  If that number is
-	 * smaller than the bit sequence number, inform the caller
-	 * that nothing this large (or larger) will work.
-	 */
-	if (bitseq > --index)
-		return -1;
-
-	/*
-	 * We will be switching bits at the end of name, with a
-	 * preference for affecting the last bytes first.  Compute
-	 * where in the name we'll start applying the changes.
-	 */
-	offset = name_len - (bit_to_flip[index][1].byte + 1);
-	index -= bitseq;	/* Use later table entries first */
-
-	p0 = name + offset + bit_to_flip[index][0].byte;
-	p1 = name + offset + bit_to_flip[index][1].byte;
-	m0 = 1 << bit_to_flip[index][0].bit;
-	m1 = 1 << bit_to_flip[index][1].bit;
-
-	/* Only change the bytes if it produces valid characters */
-
-	if (is_invalid_char(*p0 ^ m0) || is_invalid_char(*p1 ^ m1))
-		return 0;
-
-	*p0 ^= m0;
-	*p1 ^= m1;
-
-	return 1;
-}
-
-/*
- * This function generates a well-defined sequence of "alternate"
- * names for a given name.  An alternate is a name having the same
- * length and same hash value as the original name.  This is needed
- * because the algorithm produces only one obfuscated name to use
- * for a given original name, and it's possible that result matches
- * a name already seen.  This function checks for this, and if it
- * occurs, finds another suitable obfuscated name to use.
- *
- * Each bit in the binary representation of the sequence number is
- * used to select one possible "bit flip" operation to perform on
- * the name.  So for example:
- *    seq = 0:	selects no bits to flip
- *    seq = 1:	selects the 0th bit to flip
- *    seq = 2:	selects the 1st bit to flip
- *    seq = 3:	selects the 0th and 1st bit to flip
- *    ... and so on.
- *
- * The flip_bit() function takes care of the details of the bit
- * flipping within the name.  Note that the "1st bit" in this
- * context is a bit sequence number; i.e. it doesn't necessarily
- * mean bit 0x02 will be changed.
- *
- * If a valid name (one that contains no '/' or '\0' characters) is
- * produced by this process for the given sequence number, this
- * function returns 1.  If the result is not valid, it returns 0.
- * Returns -1 if the sequence number is beyond the the maximum for
- * names of the given length.
- *
- *
- * Discussion
- * ----------
- * The number of alternates available for a given name is dependent
- * on its length.  A "bit flip" involves inverting two bits in
- * a name--the two bits being selected such that their values
- * affect the name's hash value in the same way.  Alternates are
- * thus generated by inverting the value of pairs of such
- * "overlapping" bits in the original name.  Each byte after the
- * first in a name adds at least one bit of overlap to work with.
- * (See comments above flip_bit() for more discussion on this.)
- *
- * So the number of alternates is dependent on the number of such
- * overlapping bits in a name.  If there are N bit overlaps, there
- * 2^N alternates for that hash value.
- *
- * Here are the number of overlapping bits available for generating
- * alternates for names of specific lengths:
- *	1	0	(must have 2 bytes to have any overlap)
- *	2	1	One bit overlaps--so 2 possible alternates
- *	3	2	Two bits overlap--so 4 possible alternates
- *	4	4	Three bits overlap, so 2^3 alternates
- *	5	8	8 bits overlap (due to wrapping), 256 alternates
- *	6	18	2^18 alternates
- *	7	28	2^28 alternates
- *	   ...
- * It's clear that the number of alternates grows very quickly with
- * the length of the name.  But note that the set of alternates
- * includes invalid names.  And for certain (contrived) names, the
- * number of valid names is a fairly small fraction of the total
- * number of alternates.
- *
- * The main driver for this infrastructure for coming up with
- * alternate names is really related to names 5 (or possibly 6)
- * bytes in length.  5-byte obfuscated names contain no randomly-
- * generated bytes in them, and the chance of an obfuscated name
- * matching an already-seen name is too high to just ignore.  This
- * methodical selection of alternates ensures we don't produce
- * duplicate names unless we have exhausted our options.
- */
-static int
-find_alternate(
-	size_t		name_len,
-	unsigned char	*name,
-	uint32_t	seq)
-{
-	uint32_t	bitseq = 0;
-	uint32_t	bits = seq;
-
-	if (!seq)
-		return 1;	/* alternate 0 is the original name */
-	if (name_len < 2)	/* Must have 2 bytes to flip */
-		return -1;
-
-	for (bitseq = 0; bits; bitseq++) {
-		uint32_t	mask = 1 << bitseq;
-		int		fb;
-
-		if (!(bits & mask))
-			continue;
-
-		fb = flip_bit(name_len, name, bitseq);
-		if (fb < 1)
-			return fb ? -1 : 0;
-		bits ^= mask;
-	}
-
-	return 1;
-}
-
 /*
  * Look up the given name in the name table.  If it is already
  * present, iterate through a well-defined sequence of alternate
diff --git a/db/obfuscate.c b/db/obfuscate.c
new file mode 100644
index 00000000000..cd950b44581
--- /dev/null
+++ b/db/obfuscate.c
@@ -0,0 +1,393 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2007, 2011 SGI
+ * All Rights Reserved.
+ */
+#include "libxfs.h"
+#include "init.h"
+#include "obfuscate.h"
+
+static inline unsigned char
+random_filename_char(void)
+{
+	static unsigned char filename_alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+						"abcdefghijklmnopqrstuvwxyz"
+						"0123456789-_";
+
+	return filename_alphabet[random() % (sizeof filename_alphabet - 1)];
+}
+
+#define rol32(x,y)		(((x) << (y)) | ((x) >> (32 - (y))))
+
+/*
+ * Given a name and its hash value, massage the name in such a way
+ * that the result is another name of equal length which shares the
+ * same hash value.
+ */
+void
+obfuscate_name(
+	xfs_dahash_t	hash,
+	size_t		name_len,
+	unsigned char	*name,
+	bool		is_dirent)
+{
+	unsigned char	*oldname = NULL;
+	unsigned char	*newp;
+	int		i;
+	xfs_dahash_t	new_hash;
+	unsigned char	*first;
+	unsigned char	high_bit;
+	int		tries = 0;
+	bool		is_ci_name = is_dirent && xfs_has_asciici(mp);
+	int		shift;
+
+	/*
+	 * Our obfuscation algorithm requires at least 5-character
+	 * names, so don't bother if the name is too short.  We
+	 * work backward from a hash value to determine the last
+	 * five bytes in a name required to produce a new name
+	 * with the same hash.
+	 */
+	if (name_len < 5)
+		return;
+
+	if (is_ci_name) {
+		oldname = malloc(name_len);
+		if (!oldname)
+			return;
+		memcpy(oldname, name, name_len);
+	}
+
+again:
+	newp = name;
+	new_hash = 0;
+
+	/*
+	 * If we cannot generate a ci-compatible obfuscated name after 1000
+	 * tries, don't bother obfuscating the name.
+	 */
+	if (tries++ > 1000) {
+		memcpy(name, oldname, name_len);
+		goto out_free;
+	}
+
+	/*
+	 * The beginning of the obfuscated name can be pretty much
+	 * anything, so fill it in with random characters.
+	 * Accumulate its new hash value as we go.
+	 */
+	for (i = 0; i < name_len - 5; i++) {
+		*newp = random_filename_char();
+		if (is_ci_name)
+			new_hash = xfs_ascii_ci_xfrm(*newp) ^
+							rol32(new_hash, 7);
+		else
+			new_hash = *newp ^ rol32(new_hash, 7);
+		newp++;
+	}
+
+	/*
+	 * Compute which five bytes need to be used at the end of
+	 * the name so the hash of the obfuscated name is the same
+	 * as the hash of the original.  If any result in an invalid
+	 * character, flip a bit and arrange for a corresponding bit
+	 * in a neighboring byte to be flipped as well.  For the
+	 * last byte, the "neighbor" to change is the first byte
+	 * we're computing here.
+	 */
+	new_hash = rol32(new_hash, 3) ^ hash;
+
+	first = newp;
+	high_bit = 0;
+	for (shift = 28; shift >= 0; shift -= 7) {
+		*newp = (new_hash >> shift & 0x7f) ^ high_bit;
+		if (is_invalid_char(*newp)) {
+			*newp ^= 1;
+			high_bit = 0x80;
+		} else
+			high_bit = 0;
+
+		/*
+		 * If ascii-ci is enabled, uppercase characters are converted
+		 * to lowercase characters while computing the name hash.  If
+		 * any of the necessary correction bytes are uppercase, the
+		 * hash of the new name will not match.  Try again with a
+		 * different prefix.
+		 */
+		if (is_ci_name && xfs_ascii_ci_need_xfrm(*newp))
+			goto again;
+
+		ASSERT(!is_invalid_char(*newp));
+		newp++;
+	}
+
+	/*
+	 * If we flipped a bit on the last byte, we need to fix up
+	 * the matching bit in the first byte.  The result will
+	 * be a valid character, because we know that first byte
+	 * has 0's in its upper four bits (it was produced by a
+	 * 28-bit right-shift of a 32-bit unsigned value).
+	 */
+	if (high_bit) {
+		*first ^= 0x10;
+
+		if (is_ci_name && xfs_ascii_ci_need_xfrm(*first))
+			goto again;
+
+		ASSERT(!is_invalid_char(*first));
+	}
+out_free:
+	free(oldname);
+}
+
+/*
+ * Flip a bit in each of two bytes at the end of the given name.
+ * This is used in generating a series of alternate names to be used
+ * in the event a duplicate is found.
+ *
+ * The bits flipped are selected such that they both affect the same
+ * bit in the name's computed hash value, so flipping them both will
+ * preserve the hash.
+ *
+ * The following diagram aims to show the portion of a computed
+ * hash that a given byte of a name affects.
+ *
+ *	   31    28      24    21	     14		  8 7       3     0
+ *	   +-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-+
+ * hash:   | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+ *	   +-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-+
+ *	  last-4 ->|	       |<-- last-2 --->|	   |<--- last ---->|
+ *		 |<-- last-3 --->|	     |<-- last-1 --->|     |<- last-4
+ *			 |<-- last-7 --->|	     |<-- last-5 --->|
+ *	   |<-- last-8 --->|	       |<-- last-6 --->|
+ *			. . . and so on
+ *
+ * The last byte of the name directly affects the low-order byte of
+ * the hash.  The next-to-last affects bits 7-14, the next one back
+ * affects bits 14-21, and so on.  The effect wraps around when it
+ * goes beyond the top of the hash (as happens for byte last-4).
+ *
+ * Bits that are flipped together "overlap" on the hash value.  As
+ * an example of overlap, the last two bytes both affect bit 7 in
+ * the hash.  That pair of bytes (and their overlapping bits) can be
+ * used for this "flip bit" operation (it's the first pair tried,
+ * actually).
+ *
+ * A table defines overlapping pairs--the bytes involved and bits
+ * within them--that can be used this way.  The byte offset is
+ * relative to a starting point within the name, which will be set
+ * to affect the bytes at the end of the name.  The function is
+ * called with a "bitseq" value which indicates which bit flip is
+ * desired, and this translates directly into selecting which entry
+ * in the bit_to_flip[] table to apply.
+ *
+ * The function returns 1 if the operation was successful.  It
+ * returns 0 if the result produced a character that's not valid in
+ * a name (either '/' or a '\0').  Finally, it returns -1 if the bit
+ * sequence number is beyond what is supported for a name of this
+ * length.
+ *
+ * Discussion
+ * ----------
+ * (Also see the discussion above find_alternate(), below.)
+ *
+ * In order to make this function work for any length name, the
+ * table is ordered by increasing byte offset, so that the earliest
+ * entries can apply to the shortest strings.  This way all names
+ * are done consistently.
+ *
+ * When bit flips occur, they can convert printable characters
+ * into non-printable ones.  In an effort to reduce the impact of
+ * this, the first bit flips are chosen to affect bytes the end of
+ * the name (and furthermore, toward the low bits of a byte).  Those
+ * bytes are often non-printable anyway because of the way they are
+ * initially selected by obfuscate_name()).  This is accomplished,
+ * using later table entries first.
+ *
+ * Each row in the table doubles the number of alternates that
+ * can be generated.  A two-byte name is limited to using only
+ * the first row, so it's possible to generate two alternates
+ * (the original name, plus the alternate produced by flipping
+ * the one pair of bits).  In a 5-byte name, the effect of the
+ * first byte overlaps the last by 4 its, and there are 8 bits
+ * to flip, allowing for 256 possible alternates.
+ *
+ * Short names (less than 5 bytes) are never even obfuscated, so for
+ * such names the relatively small number of alternates should never
+ * really be a problem.
+ *
+ * Long names (more than 6 bytes, say) are not likely to exhaust
+ * the number of available alternates.  In fact, the table could
+ * probably have stopped at 8 entries, on the assumption that 256
+ * alternates should be enough for most any situation.  The entries
+ * beyond those are present mostly for demonstration of how it could
+ * be populated with more entries, should it ever be necessary to do
+ * so.
+ */
+static int
+flip_bit(
+	size_t		name_len,
+	unsigned char	*name,
+	uint32_t	bitseq)
+{
+	int	index;
+	size_t	offset;
+	unsigned char *p0, *p1;
+	unsigned char m0, m1;
+	struct {
+	    int		byte;	/* Offset from start within name */
+	    unsigned char bit;	/* Bit within that byte */
+	} bit_to_flip[][2] = {	/* Sorted by second entry's byte */
+	    { { 0, 0 }, { 1, 7 } },	/* Each row defines a pair */
+	    { { 1, 0 }, { 2, 7 } },	/* of bytes and a bit within */
+	    { { 2, 0 }, { 3, 7 } },	/* each byte.  Each bit in */
+	    { { 0, 4 }, { 4, 0 } },	/* a pair affects the same */
+	    { { 0, 5 }, { 4, 1 } },	/* bit in the hash, so flipping */
+	    { { 0, 6 }, { 4, 2 } },	/* both will change the name */
+	    { { 0, 7 }, { 4, 3 } },	/* while preserving the hash. */
+	    { { 3, 0 }, { 4, 7 } },
+	    { { 0, 0 }, { 5, 3 } },	/* The first entry's byte offset */
+	    { { 0, 1 }, { 5, 4 } },	/* must be less than the second. */
+	    { { 0, 2 }, { 5, 5 } },
+	    { { 0, 3 }, { 5, 6 } },	/* The table can be extended to */
+	    { { 0, 4 }, { 5, 7 } },	/* an arbitrary number of entries */
+	    { { 4, 0 }, { 5, 7 } },	/* but there's not much point. */
+		/* . . . */
+	};
+
+	/* Find the first entry *not* usable for name of this length */
+
+	for (index = 0; index < ARRAY_SIZE(bit_to_flip); index++)
+		if (bit_to_flip[index][1].byte >= name_len)
+			break;
+
+	/*
+	 * Back up to the last usable entry.  If that number is
+	 * smaller than the bit sequence number, inform the caller
+	 * that nothing this large (or larger) will work.
+	 */
+	if (bitseq > --index)
+		return -1;
+
+	/*
+	 * We will be switching bits at the end of name, with a
+	 * preference for affecting the last bytes first.  Compute
+	 * where in the name we'll start applying the changes.
+	 */
+	offset = name_len - (bit_to_flip[index][1].byte + 1);
+	index -= bitseq;	/* Use later table entries first */
+
+	p0 = name + offset + bit_to_flip[index][0].byte;
+	p1 = name + offset + bit_to_flip[index][1].byte;
+	m0 = 1 << bit_to_flip[index][0].bit;
+	m1 = 1 << bit_to_flip[index][1].bit;
+
+	/* Only change the bytes if it produces valid characters */
+
+	if (is_invalid_char(*p0 ^ m0) || is_invalid_char(*p1 ^ m1))
+		return 0;
+
+	*p0 ^= m0;
+	*p1 ^= m1;
+
+	return 1;
+}
+
+/*
+ * This function generates a well-defined sequence of "alternate"
+ * names for a given name.  An alternate is a name having the same
+ * length and same hash value as the original name.  This is needed
+ * because the algorithm produces only one obfuscated name to use
+ * for a given original name, and it's possible that result matches
+ * a name already seen.  This function checks for this, and if it
+ * occurs, finds another suitable obfuscated name to use.
+ *
+ * Each bit in the binary representation of the sequence number is
+ * used to select one possible "bit flip" operation to perform on
+ * the name.  So for example:
+ *    seq = 0:	selects no bits to flip
+ *    seq = 1:	selects the 0th bit to flip
+ *    seq = 2:	selects the 1st bit to flip
+ *    seq = 3:	selects the 0th and 1st bit to flip
+ *    ... and so on.
+ *
+ * The flip_bit() function takes care of the details of the bit
+ * flipping within the name.  Note that the "1st bit" in this
+ * context is a bit sequence number; i.e. it doesn't necessarily
+ * mean bit 0x02 will be changed.
+ *
+ * If a valid name (one that contains no '/' or '\0' characters) is
+ * produced by this process for the given sequence number, this
+ * function returns 1.  If the result is not valid, it returns 0.
+ * Returns -1 if the sequence number is beyond the the maximum for
+ * names of the given length.
+ *
+ *
+ * Discussion
+ * ----------
+ * The number of alternates available for a given name is dependent
+ * on its length.  A "bit flip" involves inverting two bits in
+ * a name--the two bits being selected such that their values
+ * affect the name's hash value in the same way.  Alternates are
+ * thus generated by inverting the value of pairs of such
+ * "overlapping" bits in the original name.  Each byte after the
+ * first in a name adds at least one bit of overlap to work with.
+ * (See comments above flip_bit() for more discussion on this.)
+ *
+ * So the number of alternates is dependent on the number of such
+ * overlapping bits in a name.  If there are N bit overlaps, there
+ * 2^N alternates for that hash value.
+ *
+ * Here are the number of overlapping bits available for generating
+ * alternates for names of specific lengths:
+ *	1	0	(must have 2 bytes to have any overlap)
+ *	2	1	One bit overlaps--so 2 possible alternates
+ *	3	2	Two bits overlap--so 4 possible alternates
+ *	4	4	Three bits overlap, so 2^3 alternates
+ *	5	8	8 bits overlap (due to wrapping), 256 alternates
+ *	6	18	2^18 alternates
+ *	7	28	2^28 alternates
+ *	   ...
+ * It's clear that the number of alternates grows very quickly with
+ * the length of the name.  But note that the set of alternates
+ * includes invalid names.  And for certain (contrived) names, the
+ * number of valid names is a fairly small fraction of the total
+ * number of alternates.
+ *
+ * The main driver for this infrastructure for coming up with
+ * alternate names is really related to names 5 (or possibly 6)
+ * bytes in length.  5-byte obfuscated names contain no randomly-
+ * generated bytes in them, and the chance of an obfuscated name
+ * matching an already-seen name is too high to just ignore.  This
+ * methodical selection of alternates ensures we don't produce
+ * duplicate names unless we have exhausted our options.
+ */
+int
+find_alternate(
+	size_t		name_len,
+	unsigned char	*name,
+	uint32_t	seq)
+{
+	uint32_t	bitseq = 0;
+	uint32_t	bits = seq;
+
+	if (!seq)
+		return 1;	/* alternate 0 is the original name */
+	if (name_len < 2)	/* Must have 2 bytes to flip */
+		return -1;
+
+	for (bitseq = 0; bits; bitseq++) {
+		uint32_t	mask = 1 << bitseq;
+		int		fb;
+
+		if (!(bits & mask))
+			continue;
+
+		fb = flip_bit(name_len, name, bitseq);
+		if (fb < 1)
+			return fb ? -1 : 0;
+		bits ^= mask;
+	}
+
+	return 1;
+}
diff --git a/db/obfuscate.h b/db/obfuscate.h
new file mode 100644
index 00000000000..afaaca37154
--- /dev/null
+++ b/db/obfuscate.h
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2007, 2011 SGI
+ * All Rights Reserved.
+ */
+#ifndef __DB_OBFUSCATE_H__
+#define __DB_OBFUSCATE_H__
+
+/* Routines to obfuscate directory filenames and xattr names. */
+
+#define is_invalid_char(c)	((c) == '/' || (c) == '\0')
+
+void obfuscate_name(xfs_dahash_t hash, size_t name_len, unsigned char *name,
+		bool is_dirent);
+int find_alternate(size_t name_len, unsigned char *name, uint32_t seq);
+
+#endif /* __DB_OBFUSCATE_H__ */

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH v2 1/3] xfs_db: hoist name obfuscation code out of metadump.c
  2023-06-15 16:12   ` [PATCH v2 " Darrick J. Wong
@ 2023-06-22 11:55     ` Carlos Maiolino
  0 siblings, 0 replies; 11+ messages in thread
From: Carlos Maiolino @ 2023-06-22 11:55 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: linux-xfs

On Thu, Jun 15, 2023 at 09:12:06AM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <djwong@kernel.org>
> 
> We want to create a debugger command that will create obfuscated names
> for directory and xattr names, so hoist the name obfuscation code into a
> separate file.
> 
> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
> Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
> ---
> v2: rebase to deal with s/alloca/malloc change
> ---

Thanks!
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>

>  db/Makefile    |    2
>  db/metadump.c  |  388 -------------------------------------------------------
>  db/obfuscate.c |  393 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  db/obfuscate.h |   17 ++
>  4 files changed, 412 insertions(+), 388 deletions(-)
>  create mode 100644 db/obfuscate.c
>  create mode 100644 db/obfuscate.h
> 
> diff --git a/db/Makefile b/db/Makefile
> index b2e01174571..2f95f67075d 100644
> --- a/db/Makefile
> +++ b/db/Makefile
> @@ -13,7 +13,7 @@ HFILES = addr.h agf.h agfl.h agi.h attr.h attrshort.h bit.h block.h bmap.h \
>  	flist.h fprint.h frag.h freesp.h hash.h help.h init.h inode.h input.h \
>  	io.h logformat.h malloc.h metadump.h output.h print.h quit.h sb.h \
>  	sig.h strvec.h text.h type.h write.h attrset.h symlink.h fsmap.h \
> -	fuzz.h
> +	fuzz.h obfuscate.h
>  CFILES = $(HFILES:.h=.c) btdump.c btheight.c convert.c info.c namei.c \
>  	timelimit.c
>  LSRCFILES = xfs_admin.sh xfs_ncheck.sh xfs_metadump.sh
> diff --git a/db/metadump.c b/db/metadump.c
> index 9ccee0b7ace..d9a616a9296 100644
> --- a/db/metadump.c
> +++ b/db/metadump.c
> @@ -19,6 +19,7 @@
>  #include "faddr.h"
>  #include "field.h"
>  #include "dir2.h"
> +#include "obfuscate.h"
> 
>  #define DEFAULT_MAX_EXT_SIZE	XFS_MAX_BMBT_EXTLEN
> 
> @@ -736,19 +737,6 @@ nametable_add(xfs_dahash_t hash, int namelen, unsigned char *name)
>  	return ent;
>  }
> 
> -#define is_invalid_char(c)	((c) == '/' || (c) == '\0')
> -#define rol32(x,y)		(((x) << (y)) | ((x) >> (32 - (y))))
> -
> -static inline unsigned char
> -random_filename_char(void)
> -{
> -	static unsigned char filename_alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
> -						"abcdefghijklmnopqrstuvwxyz"
> -						"0123456789-_";
> -
> -	return filename_alphabet[random() % (sizeof filename_alphabet - 1)];
> -}
> -
>  #define	ORPHANAGE	"lost+found"
>  #define	ORPHANAGE_LEN	(sizeof (ORPHANAGE) - 1)
> 
> @@ -808,380 +796,6 @@ in_lost_found(
>  	return slen == namelen && !memcmp(name, s, namelen);
>  }
> 
> -/*
> - * Given a name and its hash value, massage the name in such a way
> - * that the result is another name of equal length which shares the
> - * same hash value.
> - */
> -static void
> -obfuscate_name(
> -	xfs_dahash_t	hash,
> -	size_t		name_len,
> -	unsigned char	*name,
> -	bool		is_dirent)
> -{
> -	unsigned char	*oldname = NULL;
> -	unsigned char	*newp;
> -	int		i;
> -	xfs_dahash_t	new_hash;
> -	unsigned char	*first;
> -	unsigned char	high_bit;
> -	int		tries = 0;
> -	bool		is_ci_name = is_dirent && xfs_has_asciici(mp);
> -	int		shift;
> -
> -	/*
> -	 * Our obfuscation algorithm requires at least 5-character
> -	 * names, so don't bother if the name is too short.  We
> -	 * work backward from a hash value to determine the last
> -	 * five bytes in a name required to produce a new name
> -	 * with the same hash.
> -	 */
> -	if (name_len < 5)
> -		return;
> -
> -	if (is_ci_name) {
> -		oldname = malloc(name_len);
> -		if (!oldname)
> -			return;
> -		memcpy(oldname, name, name_len);
> -	}
> -
> -again:
> -	newp = name;
> -	new_hash = 0;
> -
> -	/*
> -	 * If we cannot generate a ci-compatible obfuscated name after 1000
> -	 * tries, don't bother obfuscating the name.
> -	 */
> -	if (tries++ > 1000) {
> -		memcpy(name, oldname, name_len);
> -		goto out_free;
> -	}
> -
> -	/*
> -	 * The beginning of the obfuscated name can be pretty much
> -	 * anything, so fill it in with random characters.
> -	 * Accumulate its new hash value as we go.
> -	 */
> -	for (i = 0; i < name_len - 5; i++) {
> -		*newp = random_filename_char();
> -		if (is_ci_name)
> -			new_hash = xfs_ascii_ci_xfrm(*newp) ^
> -							rol32(new_hash, 7);
> -		else
> -			new_hash = *newp ^ rol32(new_hash, 7);
> -		newp++;
> -	}
> -
> -	/*
> -	 * Compute which five bytes need to be used at the end of
> -	 * the name so the hash of the obfuscated name is the same
> -	 * as the hash of the original.  If any result in an invalid
> -	 * character, flip a bit and arrange for a corresponding bit
> -	 * in a neighboring byte to be flipped as well.  For the
> -	 * last byte, the "neighbor" to change is the first byte
> -	 * we're computing here.
> -	 */
> -	new_hash = rol32(new_hash, 3) ^ hash;
> -
> -	first = newp;
> -	high_bit = 0;
> -	for (shift = 28; shift >= 0; shift -= 7) {
> -		*newp = (new_hash >> shift & 0x7f) ^ high_bit;
> -		if (is_invalid_char(*newp)) {
> -			*newp ^= 1;
> -			high_bit = 0x80;
> -		} else
> -			high_bit = 0;
> -
> -		/*
> -		 * If ascii-ci is enabled, uppercase characters are converted
> -		 * to lowercase characters while computing the name hash.  If
> -		 * any of the necessary correction bytes are uppercase, the
> -		 * hash of the new name will not match.  Try again with a
> -		 * different prefix.
> -		 */
> -		if (is_ci_name && xfs_ascii_ci_need_xfrm(*newp))
> -			goto again;
> -
> -		ASSERT(!is_invalid_char(*newp));
> -		newp++;
> -	}
> -
> -	/*
> -	 * If we flipped a bit on the last byte, we need to fix up
> -	 * the matching bit in the first byte.  The result will
> -	 * be a valid character, because we know that first byte
> -	 * has 0's in its upper four bits (it was produced by a
> -	 * 28-bit right-shift of a 32-bit unsigned value).
> -	 */
> -	if (high_bit) {
> -		*first ^= 0x10;
> -
> -		if (is_ci_name && xfs_ascii_ci_need_xfrm(*first))
> -			goto again;
> -
> -		ASSERT(!is_invalid_char(*first));
> -	}
> -
> -out_free:
> -	free(oldname);
> -}
> -
> -/*
> - * Flip a bit in each of two bytes at the end of the given name.
> - * This is used in generating a series of alternate names to be used
> - * in the event a duplicate is found.
> - *
> - * The bits flipped are selected such that they both affect the same
> - * bit in the name's computed hash value, so flipping them both will
> - * preserve the hash.
> - *
> - * The following diagram aims to show the portion of a computed
> - * hash that a given byte of a name affects.
> - *
> - *	   31    28      24    21	     14		  8 7       3     0
> - *	   +-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-+
> - * hash:   | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
> - *	   +-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-+
> - *	  last-4 ->|	       |<-- last-2 --->|	   |<--- last ---->|
> - *		 |<-- last-3 --->|	     |<-- last-1 --->|     |<- last-4
> - *			 |<-- last-7 --->|	     |<-- last-5 --->|
> - *	   |<-- last-8 --->|	       |<-- last-6 --->|
> - *			. . . and so on
> - *
> - * The last byte of the name directly affects the low-order byte of
> - * the hash.  The next-to-last affects bits 7-14, the next one back
> - * affects bits 14-21, and so on.  The effect wraps around when it
> - * goes beyond the top of the hash (as happens for byte last-4).
> - *
> - * Bits that are flipped together "overlap" on the hash value.  As
> - * an example of overlap, the last two bytes both affect bit 7 in
> - * the hash.  That pair of bytes (and their overlapping bits) can be
> - * used for this "flip bit" operation (it's the first pair tried,
> - * actually).
> - *
> - * A table defines overlapping pairs--the bytes involved and bits
> - * within them--that can be used this way.  The byte offset is
> - * relative to a starting point within the name, which will be set
> - * to affect the bytes at the end of the name.  The function is
> - * called with a "bitseq" value which indicates which bit flip is
> - * desired, and this translates directly into selecting which entry
> - * in the bit_to_flip[] table to apply.
> - *
> - * The function returns 1 if the operation was successful.  It
> - * returns 0 if the result produced a character that's not valid in
> - * a name (either '/' or a '\0').  Finally, it returns -1 if the bit
> - * sequence number is beyond what is supported for a name of this
> - * length.
> - *
> - * Discussion
> - * ----------
> - * (Also see the discussion above find_alternate(), below.)
> - *
> - * In order to make this function work for any length name, the
> - * table is ordered by increasing byte offset, so that the earliest
> - * entries can apply to the shortest strings.  This way all names
> - * are done consistently.
> - *
> - * When bit flips occur, they can convert printable characters
> - * into non-printable ones.  In an effort to reduce the impact of
> - * this, the first bit flips are chosen to affect bytes the end of
> - * the name (and furthermore, toward the low bits of a byte).  Those
> - * bytes are often non-printable anyway because of the way they are
> - * initially selected by obfuscate_name()).  This is accomplished,
> - * using later table entries first.
> - *
> - * Each row in the table doubles the number of alternates that
> - * can be generated.  A two-byte name is limited to using only
> - * the first row, so it's possible to generate two alternates
> - * (the original name, plus the alternate produced by flipping
> - * the one pair of bits).  In a 5-byte name, the effect of the
> - * first byte overlaps the last by 4 its, and there are 8 bits
> - * to flip, allowing for 256 possible alternates.
> - *
> - * Short names (less than 5 bytes) are never even obfuscated, so for
> - * such names the relatively small number of alternates should never
> - * really be a problem.
> - *
> - * Long names (more than 6 bytes, say) are not likely to exhaust
> - * the number of available alternates.  In fact, the table could
> - * probably have stopped at 8 entries, on the assumption that 256
> - * alternates should be enough for most any situation.  The entries
> - * beyond those are present mostly for demonstration of how it could
> - * be populated with more entries, should it ever be necessary to do
> - * so.
> - */
> -static int
> -flip_bit(
> -	size_t		name_len,
> -	unsigned char	*name,
> -	uint32_t	bitseq)
> -{
> -	int	index;
> -	size_t	offset;
> -	unsigned char *p0, *p1;
> -	unsigned char m0, m1;
> -	struct {
> -	    int		byte;	/* Offset from start within name */
> -	    unsigned char bit;	/* Bit within that byte */
> -	} bit_to_flip[][2] = {	/* Sorted by second entry's byte */
> -	    { { 0, 0 }, { 1, 7 } },	/* Each row defines a pair */
> -	    { { 1, 0 }, { 2, 7 } },	/* of bytes and a bit within */
> -	    { { 2, 0 }, { 3, 7 } },	/* each byte.  Each bit in */
> -	    { { 0, 4 }, { 4, 0 } },	/* a pair affects the same */
> -	    { { 0, 5 }, { 4, 1 } },	/* bit in the hash, so flipping */
> -	    { { 0, 6 }, { 4, 2 } },	/* both will change the name */
> -	    { { 0, 7 }, { 4, 3 } },	/* while preserving the hash. */
> -	    { { 3, 0 }, { 4, 7 } },
> -	    { { 0, 0 }, { 5, 3 } },	/* The first entry's byte offset */
> -	    { { 0, 1 }, { 5, 4 } },	/* must be less than the second. */
> -	    { { 0, 2 }, { 5, 5 } },
> -	    { { 0, 3 }, { 5, 6 } },	/* The table can be extended to */
> -	    { { 0, 4 }, { 5, 7 } },	/* an arbitrary number of entries */
> -	    { { 4, 0 }, { 5, 7 } },	/* but there's not much point. */
> -		/* . . . */
> -	};
> -
> -	/* Find the first entry *not* usable for name of this length */
> -
> -	for (index = 0; index < ARRAY_SIZE(bit_to_flip); index++)
> -		if (bit_to_flip[index][1].byte >= name_len)
> -			break;
> -
> -	/*
> -	 * Back up to the last usable entry.  If that number is
> -	 * smaller than the bit sequence number, inform the caller
> -	 * that nothing this large (or larger) will work.
> -	 */
> -	if (bitseq > --index)
> -		return -1;
> -
> -	/*
> -	 * We will be switching bits at the end of name, with a
> -	 * preference for affecting the last bytes first.  Compute
> -	 * where in the name we'll start applying the changes.
> -	 */
> -	offset = name_len - (bit_to_flip[index][1].byte + 1);
> -	index -= bitseq;	/* Use later table entries first */
> -
> -	p0 = name + offset + bit_to_flip[index][0].byte;
> -	p1 = name + offset + bit_to_flip[index][1].byte;
> -	m0 = 1 << bit_to_flip[index][0].bit;
> -	m1 = 1 << bit_to_flip[index][1].bit;
> -
> -	/* Only change the bytes if it produces valid characters */
> -
> -	if (is_invalid_char(*p0 ^ m0) || is_invalid_char(*p1 ^ m1))
> -		return 0;
> -
> -	*p0 ^= m0;
> -	*p1 ^= m1;
> -
> -	return 1;
> -}
> -
> -/*
> - * This function generates a well-defined sequence of "alternate"
> - * names for a given name.  An alternate is a name having the same
> - * length and same hash value as the original name.  This is needed
> - * because the algorithm produces only one obfuscated name to use
> - * for a given original name, and it's possible that result matches
> - * a name already seen.  This function checks for this, and if it
> - * occurs, finds another suitable obfuscated name to use.
> - *
> - * Each bit in the binary representation of the sequence number is
> - * used to select one possible "bit flip" operation to perform on
> - * the name.  So for example:
> - *    seq = 0:	selects no bits to flip
> - *    seq = 1:	selects the 0th bit to flip
> - *    seq = 2:	selects the 1st bit to flip
> - *    seq = 3:	selects the 0th and 1st bit to flip
> - *    ... and so on.
> - *
> - * The flip_bit() function takes care of the details of the bit
> - * flipping within the name.  Note that the "1st bit" in this
> - * context is a bit sequence number; i.e. it doesn't necessarily
> - * mean bit 0x02 will be changed.
> - *
> - * If a valid name (one that contains no '/' or '\0' characters) is
> - * produced by this process for the given sequence number, this
> - * function returns 1.  If the result is not valid, it returns 0.
> - * Returns -1 if the sequence number is beyond the the maximum for
> - * names of the given length.
> - *
> - *
> - * Discussion
> - * ----------
> - * The number of alternates available for a given name is dependent
> - * on its length.  A "bit flip" involves inverting two bits in
> - * a name--the two bits being selected such that their values
> - * affect the name's hash value in the same way.  Alternates are
> - * thus generated by inverting the value of pairs of such
> - * "overlapping" bits in the original name.  Each byte after the
> - * first in a name adds at least one bit of overlap to work with.
> - * (See comments above flip_bit() for more discussion on this.)
> - *
> - * So the number of alternates is dependent on the number of such
> - * overlapping bits in a name.  If there are N bit overlaps, there
> - * 2^N alternates for that hash value.
> - *
> - * Here are the number of overlapping bits available for generating
> - * alternates for names of specific lengths:
> - *	1	0	(must have 2 bytes to have any overlap)
> - *	2	1	One bit overlaps--so 2 possible alternates
> - *	3	2	Two bits overlap--so 4 possible alternates
> - *	4	4	Three bits overlap, so 2^3 alternates
> - *	5	8	8 bits overlap (due to wrapping), 256 alternates
> - *	6	18	2^18 alternates
> - *	7	28	2^28 alternates
> - *	   ...
> - * It's clear that the number of alternates grows very quickly with
> - * the length of the name.  But note that the set of alternates
> - * includes invalid names.  And for certain (contrived) names, the
> - * number of valid names is a fairly small fraction of the total
> - * number of alternates.
> - *
> - * The main driver for this infrastructure for coming up with
> - * alternate names is really related to names 5 (or possibly 6)
> - * bytes in length.  5-byte obfuscated names contain no randomly-
> - * generated bytes in them, and the chance of an obfuscated name
> - * matching an already-seen name is too high to just ignore.  This
> - * methodical selection of alternates ensures we don't produce
> - * duplicate names unless we have exhausted our options.
> - */
> -static int
> -find_alternate(
> -	size_t		name_len,
> -	unsigned char	*name,
> -	uint32_t	seq)
> -{
> -	uint32_t	bitseq = 0;
> -	uint32_t	bits = seq;
> -
> -	if (!seq)
> -		return 1;	/* alternate 0 is the original name */
> -	if (name_len < 2)	/* Must have 2 bytes to flip */
> -		return -1;
> -
> -	for (bitseq = 0; bits; bitseq++) {
> -		uint32_t	mask = 1 << bitseq;
> -		int		fb;
> -
> -		if (!(bits & mask))
> -			continue;
> -
> -		fb = flip_bit(name_len, name, bitseq);
> -		if (fb < 1)
> -			return fb ? -1 : 0;
> -		bits ^= mask;
> -	}
> -
> -	return 1;
> -}
> -
>  /*
>   * Look up the given name in the name table.  If it is already
>   * present, iterate through a well-defined sequence of alternate
> diff --git a/db/obfuscate.c b/db/obfuscate.c
> new file mode 100644
> index 00000000000..cd950b44581
> --- /dev/null
> +++ b/db/obfuscate.c
> @@ -0,0 +1,393 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2007, 2011 SGI
> + * All Rights Reserved.
> + */
> +#include "libxfs.h"
> +#include "init.h"
> +#include "obfuscate.h"
> +
> +static inline unsigned char
> +random_filename_char(void)
> +{
> +	static unsigned char filename_alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
> +						"abcdefghijklmnopqrstuvwxyz"
> +						"0123456789-_";
> +
> +	return filename_alphabet[random() % (sizeof filename_alphabet - 1)];
> +}
> +
> +#define rol32(x,y)		(((x) << (y)) | ((x) >> (32 - (y))))
> +
> +/*
> + * Given a name and its hash value, massage the name in such a way
> + * that the result is another name of equal length which shares the
> + * same hash value.
> + */
> +void
> +obfuscate_name(
> +	xfs_dahash_t	hash,
> +	size_t		name_len,
> +	unsigned char	*name,
> +	bool		is_dirent)
> +{
> +	unsigned char	*oldname = NULL;
> +	unsigned char	*newp;
> +	int		i;
> +	xfs_dahash_t	new_hash;
> +	unsigned char	*first;
> +	unsigned char	high_bit;
> +	int		tries = 0;
> +	bool		is_ci_name = is_dirent && xfs_has_asciici(mp);
> +	int		shift;
> +
> +	/*
> +	 * Our obfuscation algorithm requires at least 5-character
> +	 * names, so don't bother if the name is too short.  We
> +	 * work backward from a hash value to determine the last
> +	 * five bytes in a name required to produce a new name
> +	 * with the same hash.
> +	 */
> +	if (name_len < 5)
> +		return;
> +
> +	if (is_ci_name) {
> +		oldname = malloc(name_len);
> +		if (!oldname)
> +			return;
> +		memcpy(oldname, name, name_len);
> +	}
> +
> +again:
> +	newp = name;
> +	new_hash = 0;
> +
> +	/*
> +	 * If we cannot generate a ci-compatible obfuscated name after 1000
> +	 * tries, don't bother obfuscating the name.
> +	 */
> +	if (tries++ > 1000) {
> +		memcpy(name, oldname, name_len);
> +		goto out_free;
> +	}
> +
> +	/*
> +	 * The beginning of the obfuscated name can be pretty much
> +	 * anything, so fill it in with random characters.
> +	 * Accumulate its new hash value as we go.
> +	 */
> +	for (i = 0; i < name_len - 5; i++) {
> +		*newp = random_filename_char();
> +		if (is_ci_name)
> +			new_hash = xfs_ascii_ci_xfrm(*newp) ^
> +							rol32(new_hash, 7);
> +		else
> +			new_hash = *newp ^ rol32(new_hash, 7);
> +		newp++;
> +	}
> +
> +	/*
> +	 * Compute which five bytes need to be used at the end of
> +	 * the name so the hash of the obfuscated name is the same
> +	 * as the hash of the original.  If any result in an invalid
> +	 * character, flip a bit and arrange for a corresponding bit
> +	 * in a neighboring byte to be flipped as well.  For the
> +	 * last byte, the "neighbor" to change is the first byte
> +	 * we're computing here.
> +	 */
> +	new_hash = rol32(new_hash, 3) ^ hash;
> +
> +	first = newp;
> +	high_bit = 0;
> +	for (shift = 28; shift >= 0; shift -= 7) {
> +		*newp = (new_hash >> shift & 0x7f) ^ high_bit;
> +		if (is_invalid_char(*newp)) {
> +			*newp ^= 1;
> +			high_bit = 0x80;
> +		} else
> +			high_bit = 0;
> +
> +		/*
> +		 * If ascii-ci is enabled, uppercase characters are converted
> +		 * to lowercase characters while computing the name hash.  If
> +		 * any of the necessary correction bytes are uppercase, the
> +		 * hash of the new name will not match.  Try again with a
> +		 * different prefix.
> +		 */
> +		if (is_ci_name && xfs_ascii_ci_need_xfrm(*newp))
> +			goto again;
> +
> +		ASSERT(!is_invalid_char(*newp));
> +		newp++;
> +	}
> +
> +	/*
> +	 * If we flipped a bit on the last byte, we need to fix up
> +	 * the matching bit in the first byte.  The result will
> +	 * be a valid character, because we know that first byte
> +	 * has 0's in its upper four bits (it was produced by a
> +	 * 28-bit right-shift of a 32-bit unsigned value).
> +	 */
> +	if (high_bit) {
> +		*first ^= 0x10;
> +
> +		if (is_ci_name && xfs_ascii_ci_need_xfrm(*first))
> +			goto again;
> +
> +		ASSERT(!is_invalid_char(*first));
> +	}
> +out_free:
> +	free(oldname);
> +}
> +
> +/*
> + * Flip a bit in each of two bytes at the end of the given name.
> + * This is used in generating a series of alternate names to be used
> + * in the event a duplicate is found.
> + *
> + * The bits flipped are selected such that they both affect the same
> + * bit in the name's computed hash value, so flipping them both will
> + * preserve the hash.
> + *
> + * The following diagram aims to show the portion of a computed
> + * hash that a given byte of a name affects.
> + *
> + *	   31    28      24    21	     14		  8 7       3     0
> + *	   +-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-+
> + * hash:   | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
> + *	   +-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-+
> + *	  last-4 ->|	       |<-- last-2 --->|	   |<--- last ---->|
> + *		 |<-- last-3 --->|	     |<-- last-1 --->|     |<- last-4
> + *			 |<-- last-7 --->|	     |<-- last-5 --->|
> + *	   |<-- last-8 --->|	       |<-- last-6 --->|
> + *			. . . and so on
> + *
> + * The last byte of the name directly affects the low-order byte of
> + * the hash.  The next-to-last affects bits 7-14, the next one back
> + * affects bits 14-21, and so on.  The effect wraps around when it
> + * goes beyond the top of the hash (as happens for byte last-4).
> + *
> + * Bits that are flipped together "overlap" on the hash value.  As
> + * an example of overlap, the last two bytes both affect bit 7 in
> + * the hash.  That pair of bytes (and their overlapping bits) can be
> + * used for this "flip bit" operation (it's the first pair tried,
> + * actually).
> + *
> + * A table defines overlapping pairs--the bytes involved and bits
> + * within them--that can be used this way.  The byte offset is
> + * relative to a starting point within the name, which will be set
> + * to affect the bytes at the end of the name.  The function is
> + * called with a "bitseq" value which indicates which bit flip is
> + * desired, and this translates directly into selecting which entry
> + * in the bit_to_flip[] table to apply.
> + *
> + * The function returns 1 if the operation was successful.  It
> + * returns 0 if the result produced a character that's not valid in
> + * a name (either '/' or a '\0').  Finally, it returns -1 if the bit
> + * sequence number is beyond what is supported for a name of this
> + * length.
> + *
> + * Discussion
> + * ----------
> + * (Also see the discussion above find_alternate(), below.)
> + *
> + * In order to make this function work for any length name, the
> + * table is ordered by increasing byte offset, so that the earliest
> + * entries can apply to the shortest strings.  This way all names
> + * are done consistently.
> + *
> + * When bit flips occur, they can convert printable characters
> + * into non-printable ones.  In an effort to reduce the impact of
> + * this, the first bit flips are chosen to affect bytes the end of
> + * the name (and furthermore, toward the low bits of a byte).  Those
> + * bytes are often non-printable anyway because of the way they are
> + * initially selected by obfuscate_name()).  This is accomplished,
> + * using later table entries first.
> + *
> + * Each row in the table doubles the number of alternates that
> + * can be generated.  A two-byte name is limited to using only
> + * the first row, so it's possible to generate two alternates
> + * (the original name, plus the alternate produced by flipping
> + * the one pair of bits).  In a 5-byte name, the effect of the
> + * first byte overlaps the last by 4 its, and there are 8 bits
> + * to flip, allowing for 256 possible alternates.
> + *
> + * Short names (less than 5 bytes) are never even obfuscated, so for
> + * such names the relatively small number of alternates should never
> + * really be a problem.
> + *
> + * Long names (more than 6 bytes, say) are not likely to exhaust
> + * the number of available alternates.  In fact, the table could
> + * probably have stopped at 8 entries, on the assumption that 256
> + * alternates should be enough for most any situation.  The entries
> + * beyond those are present mostly for demonstration of how it could
> + * be populated with more entries, should it ever be necessary to do
> + * so.
> + */
> +static int
> +flip_bit(
> +	size_t		name_len,
> +	unsigned char	*name,
> +	uint32_t	bitseq)
> +{
> +	int	index;
> +	size_t	offset;
> +	unsigned char *p0, *p1;
> +	unsigned char m0, m1;
> +	struct {
> +	    int		byte;	/* Offset from start within name */
> +	    unsigned char bit;	/* Bit within that byte */
> +	} bit_to_flip[][2] = {	/* Sorted by second entry's byte */
> +	    { { 0, 0 }, { 1, 7 } },	/* Each row defines a pair */
> +	    { { 1, 0 }, { 2, 7 } },	/* of bytes and a bit within */
> +	    { { 2, 0 }, { 3, 7 } },	/* each byte.  Each bit in */
> +	    { { 0, 4 }, { 4, 0 } },	/* a pair affects the same */
> +	    { { 0, 5 }, { 4, 1 } },	/* bit in the hash, so flipping */
> +	    { { 0, 6 }, { 4, 2 } },	/* both will change the name */
> +	    { { 0, 7 }, { 4, 3 } },	/* while preserving the hash. */
> +	    { { 3, 0 }, { 4, 7 } },
> +	    { { 0, 0 }, { 5, 3 } },	/* The first entry's byte offset */
> +	    { { 0, 1 }, { 5, 4 } },	/* must be less than the second. */
> +	    { { 0, 2 }, { 5, 5 } },
> +	    { { 0, 3 }, { 5, 6 } },	/* The table can be extended to */
> +	    { { 0, 4 }, { 5, 7 } },	/* an arbitrary number of entries */
> +	    { { 4, 0 }, { 5, 7 } },	/* but there's not much point. */
> +		/* . . . */
> +	};
> +
> +	/* Find the first entry *not* usable for name of this length */
> +
> +	for (index = 0; index < ARRAY_SIZE(bit_to_flip); index++)
> +		if (bit_to_flip[index][1].byte >= name_len)
> +			break;
> +
> +	/*
> +	 * Back up to the last usable entry.  If that number is
> +	 * smaller than the bit sequence number, inform the caller
> +	 * that nothing this large (or larger) will work.
> +	 */
> +	if (bitseq > --index)
> +		return -1;
> +
> +	/*
> +	 * We will be switching bits at the end of name, with a
> +	 * preference for affecting the last bytes first.  Compute
> +	 * where in the name we'll start applying the changes.
> +	 */
> +	offset = name_len - (bit_to_flip[index][1].byte + 1);
> +	index -= bitseq;	/* Use later table entries first */
> +
> +	p0 = name + offset + bit_to_flip[index][0].byte;
> +	p1 = name + offset + bit_to_flip[index][1].byte;
> +	m0 = 1 << bit_to_flip[index][0].bit;
> +	m1 = 1 << bit_to_flip[index][1].bit;
> +
> +	/* Only change the bytes if it produces valid characters */
> +
> +	if (is_invalid_char(*p0 ^ m0) || is_invalid_char(*p1 ^ m1))
> +		return 0;
> +
> +	*p0 ^= m0;
> +	*p1 ^= m1;
> +
> +	return 1;
> +}
> +
> +/*
> + * This function generates a well-defined sequence of "alternate"
> + * names for a given name.  An alternate is a name having the same
> + * length and same hash value as the original name.  This is needed
> + * because the algorithm produces only one obfuscated name to use
> + * for a given original name, and it's possible that result matches
> + * a name already seen.  This function checks for this, and if it
> + * occurs, finds another suitable obfuscated name to use.
> + *
> + * Each bit in the binary representation of the sequence number is
> + * used to select one possible "bit flip" operation to perform on
> + * the name.  So for example:
> + *    seq = 0:	selects no bits to flip
> + *    seq = 1:	selects the 0th bit to flip
> + *    seq = 2:	selects the 1st bit to flip
> + *    seq = 3:	selects the 0th and 1st bit to flip
> + *    ... and so on.
> + *
> + * The flip_bit() function takes care of the details of the bit
> + * flipping within the name.  Note that the "1st bit" in this
> + * context is a bit sequence number; i.e. it doesn't necessarily
> + * mean bit 0x02 will be changed.
> + *
> + * If a valid name (one that contains no '/' or '\0' characters) is
> + * produced by this process for the given sequence number, this
> + * function returns 1.  If the result is not valid, it returns 0.
> + * Returns -1 if the sequence number is beyond the the maximum for
> + * names of the given length.
> + *
> + *
> + * Discussion
> + * ----------
> + * The number of alternates available for a given name is dependent
> + * on its length.  A "bit flip" involves inverting two bits in
> + * a name--the two bits being selected such that their values
> + * affect the name's hash value in the same way.  Alternates are
> + * thus generated by inverting the value of pairs of such
> + * "overlapping" bits in the original name.  Each byte after the
> + * first in a name adds at least one bit of overlap to work with.
> + * (See comments above flip_bit() for more discussion on this.)
> + *
> + * So the number of alternates is dependent on the number of such
> + * overlapping bits in a name.  If there are N bit overlaps, there
> + * 2^N alternates for that hash value.
> + *
> + * Here are the number of overlapping bits available for generating
> + * alternates for names of specific lengths:
> + *	1	0	(must have 2 bytes to have any overlap)
> + *	2	1	One bit overlaps--so 2 possible alternates
> + *	3	2	Two bits overlap--so 4 possible alternates
> + *	4	4	Three bits overlap, so 2^3 alternates
> + *	5	8	8 bits overlap (due to wrapping), 256 alternates
> + *	6	18	2^18 alternates
> + *	7	28	2^28 alternates
> + *	   ...
> + * It's clear that the number of alternates grows very quickly with
> + * the length of the name.  But note that the set of alternates
> + * includes invalid names.  And for certain (contrived) names, the
> + * number of valid names is a fairly small fraction of the total
> + * number of alternates.
> + *
> + * The main driver for this infrastructure for coming up with
> + * alternate names is really related to names 5 (or possibly 6)
> + * bytes in length.  5-byte obfuscated names contain no randomly-
> + * generated bytes in them, and the chance of an obfuscated name
> + * matching an already-seen name is too high to just ignore.  This
> + * methodical selection of alternates ensures we don't produce
> + * duplicate names unless we have exhausted our options.
> + */
> +int
> +find_alternate(
> +	size_t		name_len,
> +	unsigned char	*name,
> +	uint32_t	seq)
> +{
> +	uint32_t	bitseq = 0;
> +	uint32_t	bits = seq;
> +
> +	if (!seq)
> +		return 1;	/* alternate 0 is the original name */
> +	if (name_len < 2)	/* Must have 2 bytes to flip */
> +		return -1;
> +
> +	for (bitseq = 0; bits; bitseq++) {
> +		uint32_t	mask = 1 << bitseq;
> +		int		fb;
> +
> +		if (!(bits & mask))
> +			continue;
> +
> +		fb = flip_bit(name_len, name, bitseq);
> +		if (fb < 1)
> +			return fb ? -1 : 0;
> +		bits ^= mask;
> +	}
> +
> +	return 1;
> +}
> diff --git a/db/obfuscate.h b/db/obfuscate.h
> new file mode 100644
> index 00000000000..afaaca37154
> --- /dev/null
> +++ b/db/obfuscate.h
> @@ -0,0 +1,17 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2007, 2011 SGI
> + * All Rights Reserved.
> + */
> +#ifndef __DB_OBFUSCATE_H__
> +#define __DB_OBFUSCATE_H__
> +
> +/* Routines to obfuscate directory filenames and xattr names. */
> +
> +#define is_invalid_char(c)	((c) == '/' || (c) == '\0')
> +
> +void obfuscate_name(xfs_dahash_t hash, size_t name_len, unsigned char *name,
> +		bool is_dirent);
> +int find_alternate(size_t name_len, unsigned char *name, uint32_t seq);
> +
> +#endif /* __DB_OBFUSCATE_H__ */

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2023-06-22 11:55 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-06-05 15:36 [PATCHSET 0/3] xfs_db: create names with colliding hashes Darrick J. Wong
2023-06-05 15:37 ` [PATCH 1/3] xfs_db: hoist name obfuscation code out of metadump.c Darrick J. Wong
2023-06-06 11:17   ` Carlos Maiolino
2023-06-15 16:12   ` [PATCH v2 " Darrick J. Wong
2023-06-22 11:55     ` Carlos Maiolino
2023-06-05 15:37 ` [PATCH 2/3] xfs_db: create dirents and xattrs with colliding names Darrick J. Wong
2023-06-06 11:33   ` Carlos Maiolino
2023-06-14 11:32   ` Carlos Maiolino
2023-06-05 15:37 ` [PATCH 3/3] xfs_db: make the hash command print the dirent hash Darrick J. Wong
2023-06-06 11:36   ` Carlos Maiolino
2023-06-05 20:22 ` [PATCHSET 0/3] xfs_db: create names with colliding hashes Andrey Albershteyn

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox