* Word-at-a-time dcache name accesses (was Re: .. anybody know of any filesystems that depend on the exact VFS 'namehash' implementation?)
@ 2012-03-02 23:46 Linus Torvalds
  2012-03-03  0:02 ` Ted Ts'o
  ` (3 more replies)
  0 siblings, 4 replies; 26+ messages in thread

From: Linus Torvalds @ 2012-03-02 23:46 UTC (permalink / raw)
  To: Andi Kleen, H. Peter Anvin
  Cc: Linux Kernel Mailing List, linux-fsdevel, Al Viro

Here's a new version of that patch.

It's based on the cleanups I committed and pushed out today, so the base
may not look familiar, but the upside is that this does the configuration
automatically (currently the patch enables the word accesses on x86 by
default as long as DEBUG_PAGEALLOC isn't set).

I worked around my problems with stupid branch prediction on the '/' test
at the end by just reorganizing the code a bit, and it actually all just
looks cleaner.

This *does* assume that "bsf" is a reasonably fast instruction, which is
not necessarily the case, especially on 32-bit x86. So the config option
choice for this might want some tuning even on x86, but it would be lovely
to get comments and have people test it out on older hardware.

NOTE! There's a fair number of users of "full_name_hash()", and I'm a bit
nervous about the fact that now "full_name_hash()" does *not* match the
concept of doing

	hash = init_name_hash();
	.. for each char ..
		hash = partial_name_hash(c, hash);
	return end_name_hash(hash);

but the usage does appear to be non-overlapping. The pure number of
full_name_hash() users surprises me a bit, though. There are lots of
people doing that odd "create a qstr and then do a 'd_lookup()' on the
result".

Anyway, things work for me, and this generates pretty code and seems to
perform quite well too.

Comments?
                  Linus

---
 arch/x86/Kconfig       |  1 +
 fs/Kconfig             |  4 +++
 fs/namei.c             | 76 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/dcache.h | 23 +++++++++++++++
 4 files changed, 104 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5bed94e189fa..09675d3e0ac3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -82,6 +82,7 @@ config X86
 	select CLKEVT_I8253
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select GENERIC_IOMAP
+	select DCACHE_WORD_ACCESS if !DEBUG_PAGEALLOC

 config INSTRUCTION_DECODER
 	def_bool (KPROBES || PERF_EVENTS)
diff --git a/fs/Kconfig b/fs/Kconfig
index d621f02a3f9e..aa195265362f 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -4,6 +4,10 @@

 menu "File systems"

+# Use unaligned word dcache accesses
+config DCACHE_WORD_ACCESS
+	bool
+
 if BLOCK

 source "fs/ext2/Kconfig"
diff --git a/fs/namei.c b/fs/namei.c
index 71807dc7e402..258ef22783e1 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1374,6 +1374,80 @@ static inline int can_lookup(struct inode *inode)
 	return 1;
 }

+/* Only works for little-endian with fast unaligned accesses! */
+#ifdef CONFIG_DCACHE_WORD_ACCESS
+
+static inline unsigned int fold_hash(unsigned long hash)
+{
+#ifdef CONFIG_64BIT
+	hash += hash >> (8*sizeof(int));
+#endif
+	return hash;
+}
+
+unsigned int full_name_hash(const unsigned char *name, unsigned int len)
+{
+	unsigned long a, mask;
+	unsigned long hash = 0;
+
+	for (;;) {
+		a = *(unsigned long *)name;
+		hash *= 9;
+		if (len < sizeof(unsigned long))
+			break;
+		hash += a;
+		name += sizeof(unsigned long);
+		len -= sizeof(unsigned long);
+		if (!len)
+			goto done;
+	}
+	mask = ~(~0ul << len*8);
+	hash += mask & a;
+done:
+	return fold_hash(hash);
+}
+
+#define ONEBYTES	0x0101010101010101ul
+#define SLASHBYTES	0x2f2f2f2f2f2f2f2ful
+#define HIGHBITS	0x8080808080808080ul
+
+/* Return the high bit set in the first byte that is a zero */
+static inline unsigned long has_zero(unsigned long a)
+{
+	return ((a - ONEBYTES) & ~a) & HIGHBITS;
+}
+
+/*
+ * Calculate the length and hash of the path component, and
+ * return the length of the component;
+ */
+static inline unsigned long hash_name(const char *name, unsigned int *hashp)
+{
+	unsigned long a, mask, hash, len;
+
+	hash = a = 0;
+	len = -sizeof(unsigned long);
+	do {
+		hash = (hash + a) * 9;
+		len += sizeof(unsigned long);
+		a = *(unsigned long *)(name+len);
+		/* Do we have any NUL or '/' bytes in this word? */
+		mask = has_zero(a) | has_zero(a ^ SLASHBYTES);
+	} while (!mask);
+
+	/* Get the final path component length */
+	len += __ffs(mask) >> 3;
+
+	/* The mask *below* the first high bit set */
+	mask = (mask - 1) & ~mask;
+	mask >>= 7;
+	hash += a & mask;
+	*hashp = fold_hash(hash);
+	return len;
+}
+
+#else
+
 unsigned int full_name_hash(const unsigned char *name, unsigned int len)
 {
 	unsigned long hash = init_name_hash();
@@ -1401,6 +1475,8 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp)
 	return len;
 }

+#endif
+
 /*
  * Name resolution.
  * This is the basic name resolution function, turning a pathname into
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 4270bedd2308..6a35eaf5f3f2 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -54,6 +54,28 @@ extern struct dentry_stat_t dentry_stat;
 static inline int dentry_cmp(const unsigned char *cs, size_t scount,
 				const unsigned char *ct, size_t tcount)
 {
+#ifdef CONFIG_DCACHE_WORD_ACCESS
+	unsigned long a,b,mask;
+
+	if (unlikely(scount != tcount))
+		return 1;
+
+	for (;;) {
+		a = *(unsigned long *)cs;
+		b = *(unsigned long *)ct;
+		if (tcount < sizeof(unsigned long))
+			break;
+		if (unlikely(a != b))
+			return 1;
+		cs += sizeof(unsigned long);
+		ct += sizeof(unsigned long);
+		tcount -= sizeof(unsigned long);
+		if (!tcount)
+			return 0;
+	}
+	mask = ~(~0ul << tcount*8);
+	return unlikely(!!((a ^ b) & mask));
+#else
 	if (scount != tcount)
 		return 1;
@@ -65,6 +87,7 @@ static inline int dentry_cmp(const unsigned char *cs, size_t scount,
 		tcount--;
 	} while (tcount);
 	return 0;
+#endif
 }

 /* Name hashing routines. Initial hash value */

^ permalink raw reply related	[flat|nested] 26+ messages in thread
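The two bit tricks at the heart of the patch, has_zero() and the little-endian byte-index extraction, can be exercised in userspace. Below is a minimal sketch under the same assumptions as the patch (little-endian, 64-bit `unsigned long`), with the load done through memcpy() so it stays defined behavior, and GCC's `__builtin_ctzl()` standing in for the kernel's `__ffs()`; `first_terminator()` is a made-up name for illustration:

```c
#include <stddef.h>
#include <string.h>

#define ONEBYTES   0x0101010101010101ul
#define SLASHBYTES 0x2f2f2f2f2f2f2f2ful
#define HIGHBITS   0x8080808080808080ul

/* Sets the high bit of (at least) the first zero byte in 'a'.
 * Borrow propagation can set spurious higher flags, but the lowest
 * set flag is always the first zero byte, which is all we use. */
static inline unsigned long has_zero(unsigned long a)
{
	return ((a - ONEBYTES) & ~a) & HIGHBITS;
}

/* Byte index of the first NUL or '/' in an 8-byte window.
 * Assumes little-endian, and that a terminator exists in the window
 * (otherwise mask is zero and the ctz below is undefined). */
static size_t first_terminator(const char *buf)
{
	unsigned long a, mask;

	memcpy(&a, buf, sizeof(a));	/* defined-behavior unaligned load */
	mask = has_zero(a) | has_zero(a ^ SLASHBYTES);
	return (size_t)(__builtin_ctzl(mask) >> 3);
}
```

Each '/' byte XORed with SLASHBYTES becomes zero, so one zero-byte detector finds both terminators in a single pass over the word.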
* Re: Word-at-a-time dcache name accesses (was Re: .. anybody know of any filesystems that depend on the exact VFS 'namehash' implementation?)
  2012-03-02 23:46 Word-at-a-time dcache name accesses (was Re: .. anybody know of any filesystems that depend on the exact VFS 'namehash' implementation?) Linus Torvalds
@ 2012-03-03  0:02 ` Ted Ts'o
  2012-03-03  0:17   ` david
  2012-03-03  0:17   ` Linus Torvalds
  2012-03-03  0:38 ` H. Peter Anvin
  ` (2 subsequent siblings)
  3 siblings, 2 replies; 26+ messages in thread

From: Ted Ts'o @ 2012-03-03  0:02 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Andi Kleen, H. Peter Anvin, Linux Kernel Mailing List, linux-fsdevel, Al Viro

Stupid question. Your patch requires unaligned accesses to not have a
heavy penalty, right? Wasn't it the case that some generations of x86
had pretty large penalties for unaligned accesses? Is that something we
need to worry about?

					- Ted

^ permalink raw reply	[flat|nested] 26+ messages in thread
* Re: Word-at-a-time dcache name accesses (was Re: .. anybody know of any filesystems that depend on the exact VFS 'namehash' implementation?)
  2012-03-03  0:02 ` Ted Ts'o
@ 2012-03-03  0:17 ` david
  2012-03-03  0:24   ` Linus Torvalds
  1 sibling, 1 reply; 26+ messages in thread

From: david @ 2012-03-03  0:17 UTC (permalink / raw)
  To: Ted Ts'o
  Cc: Linus Torvalds, Andi Kleen, H. Peter Anvin, Linux Kernel Mailing List, linux-fsdevel, Al Viro

On Fri, 2 Mar 2012, Ted Ts'o wrote:

> Stupid question. Your patch requires unaligned accesses to not have a
> heavy penalty, right? Wasn't it the case that some generations of x86
> had pretty large penalties for unaligned accesses? Is that something we
> need to worry about?

another stupid question

since the code that it's replacing did byte-at-a-time access, wouldn't
that be at least as bad as the new code? or did some CPUs have efficient
char access, but inefficient unaligned word access?

David Lang

^ permalink raw reply	[flat|nested] 26+ messages in thread
* Re: Word-at-a-time dcache name accesses (was Re: .. anybody know of any filesystems that depend on the exact VFS 'namehash' implementation?) 2012-03-03 0:17 ` david @ 2012-03-03 0:24 ` Linus Torvalds 2012-03-04 22:19 ` Matthew Wilcox 0 siblings, 1 reply; 26+ messages in thread From: Linus Torvalds @ 2012-03-03 0:24 UTC (permalink / raw) To: david Cc: Ted Ts'o, Andi Kleen, H. Peter Anvin, Linux Kernel Mailing List, linux-fsdevel, Al Viro On Fri, Mar 2, 2012 at 4:17 PM, <david@lang.hm> wrote: > > or did some CPUs have efficient char access, but inefficient unaligned word > access? Tons of CPU's have efficient char accesses but horrible unaligned word accesses. Some are even outright buggy (ie at least some ARM cores) and load crap. Others take a fault. They just aren't x86, because x86 has traditionally had code with a fair amount of unaligned loads and stores (and not just for historical reasons either: even modern code replaces constant-sized memcpy() etc with direct loads and stores) For some other architectures, we could just use "get_unaligned()", which fixes things up for them. I could have made that explicit, even if it doesn't matter on x86. So the bigger portability problem to some degree is the fact that it is limited to little-endian, so even if you have a CPU with good unaligned accesses (some POWER chips do ok, for example, although not all), you'd have to also do something with the mask generation (which currently uses the "(x-1)&~x" trick that means that it generates the mask of the *low bits* - and then assumes that "low bits" means "first bytes" - ie little endian). Linus ^ permalink raw reply [flat|nested] 26+ messages in thread
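In portable C, the fix-up Linus mentions needs no per-architecture knowledge at all: a memcpy-based load is the usual userspace analogue of the kernel's get_unaligned(). A minimal sketch (the function name is made up for illustration):

```c
#include <string.h>

/* Userspace analogue of get_unaligned(): memcpy() compiles down to a
 * single plain load on machines with fast unaligned access, and to a
 * safe byte-wise sequence on machines without it - so the same source
 * is correct on x86, strict-alignment RISC, and old ARM alike. */
static inline unsigned long load_word_unaligned(const void *p)
{
	unsigned long v;

	memcpy(&v, p, sizeof(v));
	return v;
}
```

Note this only solves the alignment half of the portability problem; the mask-generation tricks in the patch still assume little-endian byte order.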
* Re: Word-at-a-time dcache name accesses (was Re: .. anybody know of any filesystems that depend on the exact VFS 'namehash' implementation?) 2012-03-03 0:24 ` Linus Torvalds @ 2012-03-04 22:19 ` Matthew Wilcox 2012-03-04 23:27 ` Linus Torvalds 0 siblings, 1 reply; 26+ messages in thread From: Matthew Wilcox @ 2012-03-04 22:19 UTC (permalink / raw) To: Linus Torvalds Cc: david, Ted Ts'o, Andi Kleen, H. Peter Anvin, Linux Kernel Mailing List, linux-fsdevel, Al Viro On Fri, Mar 02, 2012 at 04:24:11PM -0800, Linus Torvalds wrote: > Tons of CPU's have efficient char accesses but horrible unaligned word > accesses. Some are even outright buggy (ie at least some ARM cores) > and load crap. Others take a fault. To be fair, that wasn't the ARM core. That was the MEMC chip (roughly equivalent to a northbridge). Also, there's no need for Linux to care about that any more, since we removed the arm26 port in July 2007. As far as I know, all arm32 cores have been coupled with memory controllers that are functional. -- Matthew Wilcox Intel Open Source Technology Centre "Bill, look, we understand that you're interested in selling us this operating system, but compare it to ours. We can't possibly take such a retrograde step." ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: Word-at-a-time dcache name accesses (was Re: .. anybody know of any filesystems that depend on the exact VFS 'namehash' implementation?) 2012-03-04 22:19 ` Matthew Wilcox @ 2012-03-04 23:27 ` Linus Torvalds 0 siblings, 0 replies; 26+ messages in thread From: Linus Torvalds @ 2012-03-04 23:27 UTC (permalink / raw) To: Matthew Wilcox Cc: david, Ted Ts'o, Andi Kleen, H. Peter Anvin, Linux Kernel Mailing List, linux-fsdevel, Al Viro On Sun, Mar 4, 2012 at 2:19 PM, Matthew Wilcox <matthew@wil.cx> wrote: > > To be fair, that wasn't the ARM core. That was the MEMC chip (roughly > equivalent to a northbridge). Also, there's no need for Linux to care > about that any more, since we removed the arm26 port in July 2007. As far > as I know, all arm32 cores have been coupled with memory controllers > that are functional. No, it wasn't just the memory controller. The ARM cores really did some crazy stuff. Loading an unaligned pointer resulted in loading the value from the aligned pointer, and then doing a byte rotate on the result. And it did so silently without a trap. So you didn't get the right value, *and* you didn't even get a trap to tell you that the CPU couldn't do it. That's just pure garbage. You can call it a "feature", but sane people call it a bug. The fact that it was *designed* to do that doesn't make it less buggy, it just makes it sad. Now, the modern "bigger" ARM cores don't have that problem, but the small cores still do, afaik. Linus ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: Word-at-a-time dcache name accesses (was Re: .. anybody know of any filesystems that depend on the exact VFS 'namehash' implementation?)
  2012-03-03  0:02 ` Ted Ts'o
  2012-03-03  0:17   ` david
@ 2012-03-03  0:17 ` Linus Torvalds
  1 sibling, 0 replies; 26+ messages in thread

From: Linus Torvalds @ 2012-03-03  0:17 UTC (permalink / raw)
  To: Ted Ts'o, Linus Torvalds, Andi Kleen, H. Peter Anvin, Linux Kernel Mailing List, linux-fsdevel, Al Viro

On Fri, Mar 2, 2012 at 4:02 PM, Ted Ts'o <tytso@mit.edu> wrote:
> Stupid question. Your patch requires unaligned accesses to not have a
> heavy penalty, right? Wasn't it the case that some generations of x86
> had pretty large penalties for unaligned accesses? Is that something we
> need to worry about?

There are basically no x86's with heavy penalties.

Sure, unaligned accesses are often *slightly* more expensive, especially
if they cross the cache access boundary (which tends to be 8 bytes on
older 32-bit cpu's, and generally 16 bytes on more modern CPUs - so it's
not that they are unaligned per se, but that they cross the bank size).
But even then, it's usually not a huge deal (ie it takes up two read
slots instead of just one).

There are x86 chips that are extremely bad at unaligned SSE/MMX
accesses, but not regular words.

                   Linus

^ permalink raw reply	[flat|nested] 26+ messages in thread
* Re: Word-at-a-time dcache name accesses (was Re: .. anybody know of any filesystems that depend on the exact VFS 'namehash' implementation?) 2012-03-02 23:46 Word-at-a-time dcache name accesses (was Re: .. anybody know of any filesystems that depend on the exact VFS 'namehash' implementation?) Linus Torvalds 2012-03-03 0:02 ` Ted Ts'o @ 2012-03-03 0:38 ` H. Peter Anvin 2012-03-03 0:57 ` Linus Torvalds 2012-03-03 16:12 ` Word-at-a-time dcache name accesses Andi Kleen 2012-03-03 20:10 ` Word-at-a-time dcache name accesses (was Re: .. anybody know of any filesystems that depend on the exact VFS 'namehash' implementation?) Linus Torvalds 2012-03-05 3:58 ` Word-at-a-time dcache name accesses (was Re: .. anybody know of any " Jason Garrett-Glaser 3 siblings, 2 replies; 26+ messages in thread From: H. Peter Anvin @ 2012-03-03 0:38 UTC (permalink / raw) To: Linus Torvalds Cc: Andi Kleen, Linux Kernel Mailing List, linux-fsdevel, Al Viro On 03/02/2012 03:46 PM, Linus Torvalds wrote: > > Comments? > My biggest concern is what happens when this happens to be at the end of mapped kernel memory and we overrun the page? -hpa ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: Word-at-a-time dcache name accesses (was Re: .. anybody know of any filesystems that depend on the exact VFS 'namehash' implementation?) 2012-03-03 0:38 ` H. Peter Anvin @ 2012-03-03 0:57 ` Linus Torvalds 2012-03-03 1:02 ` H. Peter Anvin 2012-03-03 16:12 ` Word-at-a-time dcache name accesses Andi Kleen 1 sibling, 1 reply; 26+ messages in thread From: Linus Torvalds @ 2012-03-03 0:57 UTC (permalink / raw) To: H. Peter Anvin Cc: Andi Kleen, Linux Kernel Mailing List, linux-fsdevel, Al Viro On Fri, Mar 2, 2012 at 4:38 PM, H. Peter Anvin <hpa@zytor.com> wrote: > > My biggest concern is what happens when this happens to be at the end of > mapped kernel memory and we overrun the page? Yes. It's very unlikely, and it never happens with the dentry data itself (the name is always aligned for those). But it *can* happen if: - the page contains the filename we copied from user space - the page is the last page mapped - the filename is PATH_MAX in size (or very close) - the last component is sufficiently unaligned but I was thinking we'd just make sure not to free the last page, and just solve it that way. I was playing around with other ideas (take the page fault and fix it up), but those are all really complicated when the notion of "don't use the last page" is so much simpler. Linus ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: Word-at-a-time dcache name accesses (was Re: .. anybody know of any filesystems that depend on the exact VFS 'namehash' implementation?) 2012-03-03 0:57 ` Linus Torvalds @ 2012-03-03 1:02 ` H. Peter Anvin 2012-03-03 1:11 ` Linus Torvalds 0 siblings, 1 reply; 26+ messages in thread From: H. Peter Anvin @ 2012-03-03 1:02 UTC (permalink / raw) To: Linus Torvalds Cc: Andi Kleen, Linux Kernel Mailing List, linux-fsdevel, Al Viro On 03/02/2012 04:57 PM, Linus Torvalds wrote: > On Fri, Mar 2, 2012 at 4:38 PM, H. Peter Anvin <hpa@zytor.com> wrote: >> >> My biggest concern is what happens when this happens to be at the end of >> mapped kernel memory and we overrun the page? > > Yes. It's very unlikely, and it never happens with the dentry data > itself (the name is always aligned for those). > > But it *can* happen if: > > - the page contains the filename we copied from user space > > - the page is the last page mapped > > - the filename is PATH_MAX in size (or very close) > > - the last component is sufficiently unaligned > > but I was thinking we'd just make sure not to free the last page, and > just solve it that way. > > I was playing around with other ideas (take the page fault and fix it > up), but those are all really complicated when the notion of "don't > use the last page" is so much simpler. > Note that does mean we need a guard page after each and every discontiguous RAM range, not just the last one. Raising that issue since we have had serious bugs in that area in the past. -hpa ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: Word-at-a-time dcache name accesses (was Re: .. anybody know of any filesystems that depend on the exact VFS 'namehash' implementation?)
  2012-03-03  1:02 ` H. Peter Anvin
@ 2012-03-03  1:11 ` Linus Torvalds
  2012-03-03  1:17   ` H. Peter Anvin
  0 siblings, 1 reply; 26+ messages in thread

From: Linus Torvalds @ 2012-03-03  1:11 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Andi Kleen, Linux Kernel Mailing List, linux-fsdevel, Al Viro

On Fri, Mar 2, 2012 at 5:02 PM, H. Peter Anvin <hpa@zytor.com> wrote:
>
> Note that does mean we need a guard page after each and every
> discontiguous RAM range, not just the last one. Raising that issue
> since we have had serious bugs in that area in the past.

Are you sure? I didn't think we even *mapped* things at that granularity.

We only really need a guard page at the end of an actual end-of-ram
where we no longer have page tables and/or could hit device space.

Which in practice never actually is an issue on PC's - we already guard
against BIOS usage just under the 0xA0000 address, and in practice there
are always ACPI tables at the end of RAM (and on x86-32 we can't use
highmem for filenames anyway, so that takes away *those* cases).

Which is why I think that for testing purposes we don't even need to
care - it's basically a "can't happen" (not to mention that nobody
actually uses PATH_MAX pathnames).

For robustness and actual deployment, I do think that yes, we do want
to make it an explicit rule.

                   Linus

^ permalink raw reply	[flat|nested] 26+ messages in thread
* Re: Word-at-a-time dcache name accesses (was Re: .. anybody know of any filesystems that depend on the exact VFS 'namehash' implementation?) 2012-03-03 1:11 ` Linus Torvalds @ 2012-03-03 1:17 ` H. Peter Anvin 0 siblings, 0 replies; 26+ messages in thread From: H. Peter Anvin @ 2012-03-03 1:17 UTC (permalink / raw) To: Linus Torvalds Cc: Andi Kleen, Linux Kernel Mailing List, linux-fsdevel, Al Viro On 03/02/2012 05:11 PM, Linus Torvalds wrote: > On Fri, Mar 2, 2012 at 5:02 PM, H. Peter Anvin <hpa@zytor.com> wrote: >> >> Note that does mean we need a guard page after each and every >> discontiguous RAM range, not just the last one. Raising that issue >> since we have had serious bugs in that area in the past. > > Are you sure? I didn't think we even *mapped* things at that granularity. > > We only really need a guard page at the end of an actual end-of-ram > where we no longer have page tables and/or could hit device space. > Yes of course. Note that I'm currently pushing for mapping only RAM regions; we have a lot of bugs and workarounds related to mapping too much, and the answer to fixing that should be pretty obvious. > For robustness and actual deployment, I do think that yes, we do want > to make it an explicit rule. Definitely. Ideally those pages should be zeroed and mapped readonly. -hpa ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: Word-at-a-time dcache name accesses 2012-03-03 0:38 ` H. Peter Anvin 2012-03-03 0:57 ` Linus Torvalds @ 2012-03-03 16:12 ` Andi Kleen 2012-03-03 18:47 ` H. Peter Anvin 1 sibling, 1 reply; 26+ messages in thread From: Andi Kleen @ 2012-03-03 16:12 UTC (permalink / raw) To: H. Peter Anvin Cc: Linus Torvalds, Linux Kernel Mailing List, linux-fsdevel, Al Viro "H. Peter Anvin" <hpa@zytor.com> writes: > On 03/02/2012 03:46 PM, Linus Torvalds wrote: >> >> Comments? >> > > My biggest concern is what happens when this happens to be at the end of > mapped kernel memory and we overrun the page? A long time ago (AMD K7ish) we had hardware bugs to work around in this area. Overlapping into some PCI hole MMIO areas could cause obscure hangs. That is why I was always careful to not do that in any x86 library code. -Andi -- ak@linux.intel.com -- Speaking for myself only ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: Word-at-a-time dcache name accesses
  2012-03-03 16:12 ` Word-at-a-time dcache name accesses Andi Kleen
@ 2012-03-03 18:47 ` H. Peter Anvin
  0 siblings, 0 replies; 26+ messages in thread

From: H. Peter Anvin @ 2012-03-03 18:47 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Linus Torvalds, Linux Kernel Mailing List, linux-fsdevel, Al Viro

On 03/03/2012 08:12 AM, Andi Kleen wrote:
> "H. Peter Anvin" <hpa@zytor.com> writes:
>
>> On 03/02/2012 03:46 PM, Linus Torvalds wrote:
>>>
>>> Comments?
>>>
>>
>> My biggest concern is what happens when this happens to be at the end of
>> mapped kernel memory and we overrun the page?
>
> A long time ago (AMD K7ish) we had hardware bugs to work around in
> this area. Overlapping into some PCI hole MMIO areas could cause obscure
> hangs. That is why I was always careful to not do that in any x86
> library code.
>

We still map too much, though. That's a separate problem, which should
be fixed in its own right.

	-hpa

^ permalink raw reply	[flat|nested] 26+ messages in thread
* Re: Word-at-a-time dcache name accesses (was Re: .. anybody know of any filesystems that depend on the exact VFS 'namehash' implementation?)
  2012-03-02 23:46 Word-at-a-time dcache name accesses (was Re: .. anybody know of any filesystems that depend on the exact VFS 'namehash' implementation?) Linus Torvalds
  2012-03-03  0:02 ` Ted Ts'o
  2012-03-03  0:38 ` H. Peter Anvin
@ 2012-03-03 20:10 ` Linus Torvalds
  2012-03-04  2:27   ` Word-at-a-time dcache name accesses (was Re: .. anybody know ofany filesystems that depend on the exact VFS 'namehash' implementation?) Tetsuo Handa
  2012-03-05  3:58 ` Word-at-a-time dcache name accesses (was Re: .. anybody know of any " Jason Garrett-Glaser
  3 siblings, 1 reply; 26+ messages in thread

From: Linus Torvalds @ 2012-03-03 20:10 UTC (permalink / raw)
  To: Andi Kleen, H. Peter Anvin
  Cc: Linux Kernel Mailing List, linux-fsdevel, Al Viro

On Fri, Mar 2, 2012 at 3:46 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> This *does* assume that "bsf" is a reasonably fast instruction, which is
> not necessarily the case especially on 32-bit x86. So the config option
> choice for this might want some tuning even on x86, but it would be lovely
> to get comments and have people test it out on older hardware.

Ok, so I was thinking about this. I can replace the "bsf" with a
multiply, and I just wonder which one is faster.

> +	/* Get the final path component length */
> +	len += __ffs(mask) >> 3;
> +
> +	/* The mask *below* the first high bit set */
> +	mask = (mask - 1) & ~mask;
> +	mask >>= 7;
> +	hash += a & mask;

So instead of the __ffs() on the original mask (to find the first byte
with the high bit set), I could use the "mask of bytes" and some math to
get the number of bytes set like this (so this goes at the end, *after*
we used the mask to mask off the bytes in 'a' - not where the __ffs() is
right now):

	/* Low bits set in each byte we used as a mask */
	mask &= ONEBYTES;
	/* Add up "mask + (mask<<8) + (mask<<16) +...": same as a multiply */
	mask *= ONEBYTES;
	/* High byte now contains count of bits set */
	len += mask >> 8*(sizeof(unsigned long)-1);

which I find intriguing because it just continues with the whole "bitmask
tricks" thing and even happens to re-use one of the bitmasks we already
had. On machines with slow bit scanning (and a good multiplier), that
might be faster.

Sadly, it's a multiply with a big constant. Yes, we could make the
constant smaller by not counting the highest byte: it is never set, so
we could use "ONEBYTES>>8" and shift right by 8*(sizeof(unsigned long)-2)
instead, but it's still not as cheap as just doing adds and masks. I
can't come up with anything really cheap to calculate "number of bytes
set".

But the above may be cheaper than the bsf on some older 32-bit machines
that have horrible bit scanning performance (eg Atom or P4). An integer
multiply tends to be around four cycles, the bsf performance is all over
the map (2-17 cycles latency).

                   Linus

^ permalink raw reply	[flat|nested] 26+ messages in thread
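The arithmetic of that multiply replacement is easy to check in isolation. A sketch under the same assumptions as the message above (64-bit `unsigned long`; the mask has already been shifted down so each "used" byte holds 0x01); `count_bytes_mul()` is a made-up name for this illustration:

```c
#define ONEBYTES 0x0101010101010101ul

/* mask holds 0x01 in every byte below the terminator. Multiplying by
 * ONEBYTES computes mask + (mask<<8) + (mask<<16) + ..., so the top
 * byte of the product accumulates the sum of the per-byte flags,
 * i.e. the number of used bytes (at most 8, so it cannot carry out). */
static inline unsigned long count_bytes_mul(unsigned long mask)
{
	mask &= ONEBYTES;
	mask *= ONEBYTES;
	return mask >> (8 * (sizeof(unsigned long) - 1));
}
```

This gives the same result as `__ffs(highbit_mask) >> 3` would, but with one multiply and one shift instead of a bit-scan instruction.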
* Re: Word-at-a-time dcache name accesses (was Re: .. anybody know ofany filesystems that depend on the exact VFS 'namehash' implementation?)
  2012-03-03 20:10 ` Word-at-a-time dcache name accesses (was Re: .. anybody know of any filesystems that depend on the exact VFS 'namehash' implementation?) Linus Torvalds
@ 2012-03-04  2:27 ` Tetsuo Handa
  2012-03-04  4:31   ` Andi Kleen
  0 siblings, 1 reply; 26+ messages in thread

From: Tetsuo Handa @ 2012-03-04  2:27 UTC (permalink / raw)
  To: torvalds, andi, hpa; +Cc: linux-kernel, linux-fsdevel, viro

A passer-by's mumble.

We are discussing systems with a huge number of "struct dentry" instances,
where selecting a hash list based on "struct qstr"->hash cannot effectively
narrow down the candidates to compare, aren't we? Then, I thought we could
reconstruct the candidates in a hash list.

For example, sort "struct dentry" entries based on the return value of
strcmp(dentry1->d_name.name, dentry2->d_name.name).

For another example, embed another hash value (calculated using some
algorithm that is different from the one used for "struct qstr"->hash)
into the leading 1 or 2 or 4 bytes of "struct dentry"->d_name.name, e.g.

  dentry1->d_name.name == { 0x12, 0x56, 0x34, 0x78, 'n', 'a', 'm', 'e', '1' }
  dentry2->d_name.name == { 0xab, 0xcd, 0xef, 0x01, 'n', 'a', 'm', 'e', '2' }

for the 32-bit hash case (where 0x12563478 is the hash value for name1 and
0xabcdef01 is the hash value for name2), and compare the embedded hash, e.g.

  * (u32 *) dentry1->d_name.name == * (u32 *) dentry2->d_name.name

before starting the comparison of the char arrays, e.g.

  memcmp(dentry1->d_name.name + 4, dentry2->d_name.name + 4, len)

^ permalink raw reply	[flat|nested] 26+ messages in thread
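The embedded-hash idea above can be sketched in a few lines of userspace C. Everything here is illustrative: the struct layout, the field sizes, and the choice of FNV-1a as the secondary hash are all assumptions made for the sketch, not anything the kernel does:

```c
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

/* Arbitrary choice of secondary hash for the sketch: 32-bit FNV-1a */
static uint32_t secondary_hash(const char *s, size_t len)
{
	uint32_t h = 2166136261u;

	while (len--)
		h = (h ^ (unsigned char)*s++) * 16777619u;
	return h;
}

/* A name stored with a 4-byte secondary hash prepended, so a lookup
 * can reject most non-matching candidates with one 32-bit compare
 * before falling back to memcmp() on the characters. */
struct prefixed_name {
	uint32_t prefix;	/* embedded secondary hash */
	char name[12];		/* small fixed buffer for the sketch */
};

static void set_name(struct prefixed_name *p, const char *s)
{
	size_t len = strlen(s);	/* must fit: len + 1 <= sizeof(p->name) */

	p->prefix = secondary_hash(s, len);
	memcpy(p->name, s, len + 1);
}

static bool names_equal(const struct prefixed_name *p, const char *s)
{
	size_t len = strlen(s);

	if (p->prefix != secondary_hash(s, len))	/* cheap reject */
		return false;
	return memcmp(p->name, s, len + 1) == 0;
}
```

As Andi notes in the reply, this is essentially a small per-entry filter: it only pays off when hash chains are long enough that most candidates are rejected by the prefix compare.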
* Re: Word-at-a-time dcache name accesses (was Re: .. anybody know ofany filesystems that depend on the exact VFS 'namehash' implementation?) 2012-03-04 2:27 ` Word-at-a-time dcache name accesses (was Re: .. anybody know ofany " Tetsuo Handa @ 2012-03-04 4:31 ` Andi Kleen 0 siblings, 0 replies; 26+ messages in thread From: Andi Kleen @ 2012-03-04 4:31 UTC (permalink / raw) To: Tetsuo Handa; +Cc: torvalds, andi, hpa, linux-kernel, linux-fsdevel, viro On Sun, Mar 04, 2012 at 11:27:01AM +0900, Tetsuo Handa wrote: > A passer-by's mumble. > > We are discussing about systems with huge "struct dentry" instances where > selecting a hash list based on "struct qstr"->hash cannot effectively narrow > down candidates to compare, aren't we? This is effectively a simple bloom filter. Would be only worth it if longer hash chains are common. And if that's the case we should probably have either a larger table or a better hash (or both) -Andi ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: Word-at-a-time dcache name accesses (was Re: .. anybody know of any filesystems that depend on the exact VFS 'namehash' implementation?) 2012-03-02 23:46 Word-at-a-time dcache name accesses (was Re: .. anybody know of any filesystems that depend on the exact VFS 'namehash' implementation?) Linus Torvalds ` (2 preceding siblings ...) 2012-03-03 20:10 ` Word-at-a-time dcache name accesses (was Re: .. anybody know of any filesystems that depend on the exact VFS 'namehash' implementation?) Linus Torvalds @ 2012-03-05 3:58 ` Jason Garrett-Glaser 2012-03-05 5:38 ` Linus Torvalds 3 siblings, 1 reply; 26+ messages in thread From: Jason Garrett-Glaser @ 2012-03-05 3:58 UTC (permalink / raw) To: Linus Torvalds Cc: Andi Kleen, H. Peter Anvin, Linux Kernel Mailing List, linux-fsdevel, Al Viro On Fri, Mar 2, 2012 at 3:46 PM, Linus Torvalds <torvalds@linux-foundation.org> wrote: > > Here's a new version of that patch. > > It's based n the cleanups I committed and pushed out today, so the base > may not look familiar, but the upside is that this does the configuration > automatically (currently the patch enables the word accesses on x86 by > default as long as DEBUG_PAGEALLOC isn't set). > > I worked around my problems with stupid branch prediction on the '/' test > at the end by just reorganizing the code a bit, and it actually all just > looks cleaner. > > This *does* assume that "bsf" is a reasonably fast instruction, which is > not necessarily the case especially on 32-bit x86. So the config option > choice for this might want some tuning even on x86, but it would be lovely > to get comments and have people test it out on older hardware. There is an improvement you can make to this. "bsf" is microcoded in many future CPUs (e.g. Piledriver) in favor of tzcnt, which has slightly different flag behavior and no undefined behavior and is part of BMI1. 
This costs a few clocks in such chips -- not as bad as the Really Slow bsf in chips like the Pentium 4 and Atom, but more than is necessary (and stalls the instruction decoder, at least on AMD, while the microcode unit runs). However, "tzcnt" is opcode-equivalent to "rep bsf", for some odd sort of backwards compatibility. Therefore, if your code does the same thing with both tzcnt and bsf, you can simply use rep bsf instead, and it'll work without any CPU checks. The manuals claim that this works on legacy CPUs, i.e. won't SIGILL. The only downside I've noticed to this is that valgrind incorrectly SIGILLs on rep bsf. Jason ^ permalink raw reply [flat|nested] 26+ messages in thread
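The semantic difference being traded on here can be modeled in a few lines of C: per the ISA manuals, tzcnt is defined to return the operand width for a zero input (setting CF), whereas bsf leaves its destination architecturally undefined for zero, which is exactly why the same `rep bsf` bytes are safe on both old and new CPUs as long as the input is known non-zero. A sketch, with `soft_tzcnt()` as a made-up reference model checked against GCC's `__builtin_ctzl()`:

```c
#include <limits.h>

/* Reference model of tzcnt semantics: count of trailing zero bits,
 * with the operand width returned for an input of zero (the one case
 * where bsf's destination register is undefined). */
static unsigned soft_tzcnt(unsigned long x)
{
	unsigned n = 0;

	if (x == 0)
		return sizeof(unsigned long) * CHAR_BIT;
	while (!(x & 1)) {
		x >>= 1;
		n++;
	}
	return n;
}
```

In the dcache code the input mask is non-zero by construction (the loop only exits once a terminator byte has been found), so bsf, tzcnt, and `rep bsf` all agree there.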
* Re: Word-at-a-time dcache name accesses (was Re: .. anybody know of any filesystems that depend on the exact VFS 'namehash' implementation?) 2012-03-05 3:58 ` Word-at-a-time dcache name accesses (was Re: .. anybody know of any " Jason Garrett-Glaser @ 2012-03-05 5:38 ` Linus Torvalds 2012-03-27 4:42 ` Brian Gerst 0 siblings, 1 reply; 26+ messages in thread From: Linus Torvalds @ 2012-03-05 5:38 UTC (permalink / raw) To: Jason Garrett-Glaser Cc: Andi Kleen, H. Peter Anvin, Linux Kernel Mailing List, linux-fsdevel, Al Viro [-- Attachment #1: Type: text/plain, Size: 2241 bytes --] On Sun, Mar 4, 2012 at 7:58 PM, Jason Garrett-Glaser <jason@x264.com> wrote: > > There is an improvement you can make to this. "bsf" is microcoded in > many future CPUs (e.g. Piledriver) in favor of tzcnt, which has > slightly different flag behavior and no undefined behavior and is part > of BMI1. So I've gotten rid of 'bsf' because it really does have problems on many CPU's. It's disgustingly slow on some older CPU's. I asked around on G+ to see if that would be useful, and there's a nice simple four-instruction sequence for the 32-bit case using just trivial operations (one shift, one and, a couple of adds). For the 64-bit case, the bsf can be replaced with a single multiply and shift. The bsf is still better on some CPU's, but the single multiply and shift is more consistently good - and as long as it doesn't stall the CPU, we're good, because the end result of it all won't be used until several cycles later. So my current patch is attached - it does depend on the current -git tree having moved dentry_cmp() into fs/dcache.c, so it's on top of *tonights* -git tree, but this is something I'm pretty happy with, and was planning on actually committing early in the 3.4 merge window. 
My profiling seems to show that the multiply is pretty much free on 64-bit at least on the cpu's I have access to - it's not like a multiply is free, but I do suspect it gets hidden very well by any OoO instruction scheduling. A bit-count instruction (popcount or bsf or tzcnt) is obviously in *theory* less work than a 64-bit multiply, but the multiply is "portable". Even if it isn't optimal, it shouldn't be horrible on any 64-bit capable x86 CPU, and it also means (for example) that the code might even work on non-x86 chips. I did only very limited profiling of the 32-bit case, but it's really just four cheap ALU instructions there and didn't really show up at all in the limited profiles I did. And at least I checked that the code worked. I have to say that the advantage of "vectorizing" this code is obviously much less if you can only do 4-byte "vectors", so I didn't actually time whether the patch *improves* anything on x86-32. Linus [-- Attachment #2: patch.diff --] [-- Type: text/x-patch, Size: 5700 bytes --] arch/x86/Kconfig | 1 + fs/Kconfig | 4 ++ fs/dcache.c | 23 ++++++++++ fs/namei.c | 129 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 157 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5bed94e189fa..09675d3e0ac3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -82,6 +82,7 @@ config X86 select CLKEVT_I8253 select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_IOMAP + select DCACHE_WORD_ACCESS if !DEBUG_PAGEALLOC config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS) diff --git a/fs/Kconfig b/fs/Kconfig index d621f02a3f9e..aa195265362f 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -4,6 +4,10 @@ menu "File systems" +# Use unaligned word dcache accesses +config DCACHE_WORD_ACCESS + bool + if BLOCK source "fs/ext2/Kconfig" diff --git a/fs/dcache.c b/fs/dcache.c index bcbdb33fcc20..ffd47a16d870 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -144,6 +144,28 @@ int proc_nr_dentry(ctl_table *table, int write, void 
__user *buffer, static inline int dentry_cmp(const unsigned char *cs, size_t scount, const unsigned char *ct, size_t tcount) { +#ifdef CONFIG_DCACHE_WORD_ACCESS + unsigned long a,b,mask; + + if (unlikely(scount != tcount)) + return 1; + + for (;;) { + a = *(unsigned long *)cs; + b = *(unsigned long *)ct; + if (tcount < sizeof(unsigned long)) + break; + if (unlikely(a != b)) + return 1; + cs += sizeof(unsigned long); + ct += sizeof(unsigned long); + tcount -= sizeof(unsigned long); + if (!tcount) + return 0; + } + mask = ~(~0ul << tcount*8); + return unlikely(!!((a ^ b) & mask)); +#else if (scount != tcount) return 1; @@ -155,6 +177,7 @@ static inline int dentry_cmp(const unsigned char *cs, size_t scount, tcount--; } while (tcount); return 0; +#endif } static void __d_free(struct rcu_head *head) diff --git a/fs/namei.c b/fs/namei.c index e2ba62820a0f..556778cd8b87 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1374,6 +1374,133 @@ static inline int can_lookup(struct inode *inode) return 1; } +/* + * We can do the critical dentry name comparison and hashing + * operations one word at a time, but we are limited to: + * + * - Architectures with fast unaligned word accesses. We could + * do a "get_unaligned()" if this helps and is sufficiently + * fast. + * + * - Little-endian machines (so that we can generate the mask + * of low bytes efficiently). Again, we *could* do a byte + * swapping load on big-endian architectures if that is not + * expensive enough to make the optimization worthless. + * + * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we + * do not trap on the (extremely unlikely) case of a page + * crossing operation). + * + * - Furthermore, we need an efficient 64-bit compile for the + * 64-bit case in order to generate the "number of bytes in + * the final mask". Again, that could be replaced with an + * efficient population count instruction or similar. 
+ */ +#ifdef CONFIG_DCACHE_WORD_ACCESS + +#ifdef CONFIG_64BIT + +/* + * Jan Achrenius on G+: microoptimized version of + * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56" + * that works for the bytemasks without having to + * mask them first. + */ +static inline long count_masked_bytes(unsigned long mask) +{ + return mask*0x0001020304050608 >> 56; +} + +static inline unsigned int fold_hash(unsigned long hash) +{ + hash += hash >> (8*sizeof(int)); + return hash; +} + +#else /* 32-bit case */ + +/* Modified Carl Chatfield G+ version for 32-bit */ +static inline long count_masked_bytes(long mask) +{ + /* + * (a) gives us + * -1 (0, ff), 0 (ffff) or 1 (ffffff) + * (b) gives us + * 0 for 0, 1 for (ff ffff ffffff) + * (a+b+1) gives us + * correct 0-3 bytemask count result + */ + long a = (mask-256) >> 23; + long b = mask & 1; + return a + b + 1; +} + +#define fold_hash(x) (x) + +#endif + +unsigned int full_name_hash(const unsigned char *name, unsigned int len) +{ + unsigned long a, mask; + unsigned long hash = 0; + + for (;;) { + a = *(unsigned long *)name; + hash *= 9; + if (len < sizeof(unsigned long)) + break; + hash += a; + name += sizeof(unsigned long); + len -= sizeof(unsigned long); + if (!len) + goto done; + } + mask = ~(~0ul << len*8); + hash += mask & a; +done: + return fold_hash(hash); +} +EXPORT_SYMBOL(full_name_hash); + +#define ONEBYTES 0x0101010101010101ul +#define SLASHBYTES 0x2f2f2f2f2f2f2f2ful +#define HIGHBITS 0x8080808080808080ul + +/* Return the high bit set in the first byte that is a zero */ +static inline unsigned long has_zero(unsigned long a) +{ + return ((a - ONEBYTES) & ~a) & HIGHBITS; +} + +/* + * Calculate the length and hash of the path component, and + * return the length of the component; + */ +static inline unsigned long hash_name(const char *name, unsigned int *hashp) +{ + unsigned long a, mask, hash, len; + + hash = a = 0; + len = -sizeof(unsigned long); + do { + hash = (hash + a) * 9; + len += sizeof(unsigned long); + a = *(unsigned 
long *)(name+len); + /* Do we have any NUL or '/' bytes in this word? */ + mask = has_zero(a) | has_zero(a ^ SLASHBYTES); + } while (!mask); + + /* The mask *below* the first high bit set */ + mask = (mask - 1) & ~mask; + mask >>= 7; + hash += a & mask; + *hashp = fold_hash(hash); + + return len + count_masked_bytes(mask); +} + +#else + unsigned int full_name_hash(const unsigned char *name, unsigned int len) { unsigned long hash = init_name_hash(); @@ -1402,6 +1529,8 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp) return len; } +#endif + /* * Name resolution. * This is the basic name resolution function, turning a pathname into ^ permalink raw reply related [flat|nested] 26+ messages in thread
* Re: Word-at-a-time dcache name accesses (was Re: .. anybody know of any filesystems that depend on the exact VFS 'namehash' implementation?) 2012-03-05 5:38 ` Linus Torvalds @ 2012-03-27 4:42 ` Brian Gerst 2012-03-27 5:02 ` Dave Jones 0 siblings, 1 reply; 26+ messages in thread From: Brian Gerst @ 2012-03-27 4:42 UTC (permalink / raw) To: Linus Torvalds Cc: Jason Garrett-Glaser, Andi Kleen, H. Peter Anvin, Linux Kernel Mailing List, linux-fsdevel, Al Viro On Mon, Mar 5, 2012 at 12:38 AM, Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Sun, Mar 4, 2012 at 7:58 PM, Jason Garrett-Glaser <jason@x264.com> wrote: >> >> There is an improvement you can make to this. "bsf" is microcoded in >> many future CPUs (e.g. Piledriver) in favor of tzcnt, which has >> slightly different flag behavior and no undefined behavior and is part >> of BMI1. > > So I've gotten rid of 'bsf' because it really does have problems on > many CPU's. It's disgustingly slow on some older CPU's. > > I asked around on G+ to see if that would be useful, and there's a > nice simple four-instruction sequence for the 32-bit case using just > trivial operations (one shift, one and, a couple of adds). > > For the 64-bit case, the bsf can be replaced with a single multiply > and shift. The bsf is still better on some CPU's, but the single > multiply and shift is more consistently good - and as long as it > doesn't stall the CPU, we're good, because the end result of it all > won't be used until several cycles later. > > So my current patch is attached - it does depend on the current -git > tree having moved dentry_cmp() into fs/dcache.c, so it's on top of > *tonights* -git tree, but this is something I'm pretty happy with, and > was planning on actually committing early in the 3.4 merge window. 
> > My profiling seems to show that the multiply is pretty much free on > 64-bit at least on the cpu's I have access to - it's not like a > multiply is free, but I do suspect it gets hidden very well by any OoO > instruction scheduling. > > A bit-count instructions (popcount or bsf or tzcnt is obviously in > *theory* less work than a 64-bit multiply, but the multiply is > "portable". Even if it isn't optimal, it shouldn't be horrible on any > 64-bit capable x86 CPU, and it also means (for example) that the code > might even work on non-x86 chips. > > I did only very limited profiling of the 32-bit case, but it's really > just four cheap ALU instructions there and didn't really show up at > all in the limited profiles I did. And at least I checked that the > code worked. I have to say that the advantage of "vectorizing" this > code is obviously much less if you can only do 4-byte "vectors", so I > didn't actually time whether the patch *improves* anything on x86-32. > > Linus This patch is causing my system (x86-64, Fedora 16) to fail to boot when DEBUG_PAGEALLOC=n. No oops, but these error messages were in the log for the bad kernel: type=1400 audit(1332802076.643:4): avc: denied { dyntransition } for pid=1 comm="systemd" scontext=system_u:system_r:kernel_t:s0 tcontext=system_u:object_r:init_exec_t:s0 tclass=process systemd[1]: Failed to transition into init label 'system_u:object_r:init_exec_t:s0', ignoring. type=1400 audit(1332816477.781:5): avc: denied { create } for pid=1 comm="systemd" scontext=system_u:system_r:kernel_t:s0 tcontext=system_u:object_r:init_exec_t:s0 tclass=unix_dgram_socket systemd[1]: systemd-shutdownd.socket failed to listen on sockets: Permission denied systemd[1]: Unit systemd-shutdownd.socket entered failed state. 
type=1400 audit(1332816477.782:6): avc: denied { create } for pid=1 comm="systemd" scontext=system_u:system_r:kernel_t:s0 tcontext=system_u:object_r:syslogd_exec_t:s0 tclass=unix_dgram_socket systemd[1]: syslog.socket failed to listen on sockets: Permission denied systemd[1]: Unit syslog.socket entered failed state. systemd-kmsg-syslogd[457]: No or too many file descriptors passed. type=1400 audit(1332816477.847:7): avc: denied { create } for pid=1 comm="systemd" scontext=system_u:system_r:kernel_t:s0 tcontext=system_u:object_r:udev_exec_t:s0 tclass=netlink_kobject_uevent_socket systemd[1]: udev-kernel.socket failed to listen on sockets: Permission denied systemd[1]: Unit udev-kernel.socket entered failed state. type=1400 audit(1332816477.848:8): avc: denied { create } for pid=1 comm="systemd" scontext=system_u:system_r:kernel_t:s0 tcontext=system_u:object_r:udev_exec_t:s0 tclass=unix_stream_socket systemd[1]: udev-control.socket failed to listen on sockets: Permission denied systemd[1]: Unit udev-control.socket entered failed state. type=1400 audit(1332816477.848:9): avc: denied { create } for pid=1 comm="systemd" scontext=system_u:system_r:kernel_t:s0 tcontext=system_u:object_r:init_exec_t:s0 tclass=unix_stream_socket systemd[1]: systemd-stdout-syslog-bridge.socket failed to listen on sockets: Permission denied -- Brian Gerst -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: Word-at-a-time dcache name accesses (was Re: .. anybody know of any filesystems that depend on the exact VFS 'namehash' implementation?) 2012-03-27 4:42 ` Brian Gerst @ 2012-03-27 5:02 ` Dave Jones 2012-03-27 5:31 ` Brian Gerst 0 siblings, 1 reply; 26+ messages in thread From: Dave Jones @ 2012-03-27 5:02 UTC (permalink / raw) To: Brian Gerst Cc: Linus Torvalds, Jason Garrett-Glaser, Andi Kleen, H. Peter Anvin, Linux Kernel Mailing List, linux-fsdevel, Al Viro On Tue, Mar 27, 2012 at 12:42:19AM -0400, Brian Gerst wrote: > > I did only very limited profiling of the 32-bit case, but it's really > > just four cheap ALU instructions there and didn't really show up at > > all in the limited profiles I did. And at least I checked that the > > code worked. I have to say that the advantage of "vectorizing" this > > code is obviously much less if you can only do 4-byte "vectors", so I > > didn't actually time whether the patch *improves* anything on x86-32. > > > > This patch is causing my system (x86-64, Fedora 16) to fail to boot > when DEBUG_PAGEALLOC=n. No oops, but these error messages were in the > log for the bad kernel: I saw this too, it was fixed by f132c5be05e407a99cf582347a2ae0120acd3ad7 for me. Does your kernel have that commit ? Dave ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: Word-at-a-time dcache name accesses (was Re: .. anybody know of any filesystems that depend on the exact VFS 'namehash' implementation?) 2012-03-27 5:02 ` Dave Jones @ 2012-03-27 5:31 ` Brian Gerst 2012-03-28 0:39 ` Linus Torvalds 0 siblings, 1 reply; 26+ messages in thread From: Brian Gerst @ 2012-03-27 5:31 UTC (permalink / raw) To: Dave Jones, Brian Gerst, Linus Torvalds, Jason Garrett-Glaser, Andi Kleen, H. Peter Anvin, Linux Kernel Mailing List, linux-fsdevel, Al Viro On Tue, Mar 27, 2012 at 1:02 AM, Dave Jones <davej@redhat.com> wrote: > On Tue, Mar 27, 2012 at 12:42:19AM -0400, Brian Gerst wrote: > > > > I did only very limited profiling of the 32-bit case, but it's really > > > just four cheap ALU instructions there and didn't really show up at > > > all in the limited profiles I did. And at least I checked that the > > > code worked. I have to say that the advantage of "vectorizing" this > > > code is obviously much less if you can only do 4-byte "vectors", so I > > > didn't actually time whether the patch *improves* anything on x86-32. > > > > > > > This patch is causing my system (x86-64, Fedora 16) to fail to boot > > when DEBUG_PAGEALLOC=n. No oops, but these error messages were in the > > log for the bad kernel: > > I saw this too, it was fixed by f132c5be05e407a99cf582347a2ae0120acd3ad7 for me. > Does your kernel have that commit ? > > Dave > > Yes, I see that commit. It looks like I have a different bug that is preventing the current head from booting then. Bisecting brought me to this commit instead. -- Brian Gerst ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: Word-at-a-time dcache name accesses (was Re: .. anybody know of any filesystems that depend on the exact VFS 'namehash' implementation?) 2012-03-27 5:31 ` Brian Gerst @ 2012-03-28 0:39 ` Linus Torvalds 2012-03-28 0:50 ` Linus Torvalds 2012-03-28 0:56 ` Brian Gerst 0 siblings, 2 replies; 26+ messages in thread From: Linus Torvalds @ 2012-03-28 0:39 UTC (permalink / raw) To: Brian Gerst Cc: Dave Jones, Jason Garrett-Glaser, Andi Kleen, H. Peter Anvin, Linux Kernel Mailing List, linux-fsdevel, Al Viro On Mon, Mar 26, 2012 at 10:31 PM, Brian Gerst <brgerst@gmail.com> wrote: > > Yes, I see that commit. It looks like I have a different bug that is > preventing the current head from booting then. Bisecting brought me > to this commit instead. So try first just forcing DCACHE_WORD_ACCESS off (easily done by just removing the "select" line in arch/x86/Kconfig). If that fixes things, it really is the word access code. But if the problem persists and it's a different bug, try to bisect it with that DCACHE_WORD_ACCESS config option always forced off, just to avoid any mixing of issues, exactly because we did have that one bug that already got fixed that looked somewhat similar.. I'll test the current -git kernel on a F16 machine I have, just to check. But I'm also assuming that there is something specific about your setup, because otherwise I'd have expected many more reports about this... Linus -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: Word-at-a-time dcache name accesses (was Re: .. anybody know of any filesystems that depend on the exact VFS 'namehash' implementation?) 2012-03-28 0:39 ` Linus Torvalds @ 2012-03-28 0:50 ` Linus Torvalds 2012-03-28 0:56 ` Brian Gerst 1 sibling, 0 replies; 26+ messages in thread From: Linus Torvalds @ 2012-03-28 0:50 UTC (permalink / raw) To: Brian Gerst Cc: Dave Jones, Jason Garrett-Glaser, Andi Kleen, H. Peter Anvin, Linux Kernel Mailing List, linux-fsdevel, Al Viro On Tue, Mar 27, 2012 at 5:39 PM, Linus Torvalds <torvalds@linux-foundation.org> wrote: > > I'll test the current -git kernel on a F16 machine I have, just to > check. But I'm also assuming that there is something specific about > your setup, because otherwise I'd have expected many more reports > about this... Confirmed. It's not something generic to F16, things work normally here on the x86-64 machine I just checked on. I wonder if maybe there's a testing snafu, and commit f132c5be05e4 wasn't in the tree you tested. So please do double-check. Linus -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: Word-at-a-time dcache name accesses (was Re: .. anybody know of any filesystems that depend on the exact VFS 'namehash' implementation?) 2012-03-28 0:39 ` Linus Torvalds 2012-03-28 0:50 ` Linus Torvalds @ 2012-03-28 0:56 ` Brian Gerst [not found] ` <CACvQF53YasSCUit2KoWDimgObknCz++aU90MesSfvAZTeUFQHw@mail.gmail.com> 1 sibling, 1 reply; 26+ messages in thread From: Brian Gerst @ 2012-03-28 0:56 UTC (permalink / raw) To: Linus Torvalds Cc: Dave Jones, Jason Garrett-Glaser, Andi Kleen, H. Peter Anvin, Linux Kernel Mailing List, linux-fsdevel, Al Viro On Tue, Mar 27, 2012 at 8:39 PM, Linus Torvalds <torvalds@linux-foundation.org> wrote: > On Mon, Mar 26, 2012 at 10:31 PM, Brian Gerst <brgerst@gmail.com> wrote: >> >> Yes, I see that commit. It looks like I have a different bug that is >> preventing the current head from booting then. Bisecting brought me >> to this commit instead. > > So try first just forcing DCACHE_WORD_ACCESS off (easily done by just > removing the "select" line in arch/x86/Kconfig). > > If that fixes things, it really is the word access code. > > But if the problem persists and it's a different bug, try to bisect it > with that DCACHE_WORD_ACCESS config option always forced off, just to > avoid any mixing of issues, exactly because we did have that one bug > that already got fixed that looked somewhat similar.. > > I'll test the current -git kernel on a F16 machine I have, just to > check. But I'm also assuming that there is something specific about > your setup, because otherwise I'd have expected many more reports > about this... > > Linus Commit f132c5be05e40 did fix this particular problem for me. There is another bug causing boot failure (seems to be in the DRM code), but bisecting initially fingered this patch. It was just looking at kernels in between this patch and the later fix. Sorry for the confusion. -- Brian Gerst ^ permalink raw reply [flat|nested] 26+ messages in thread
[parent not found: <CACvQF53YasSCUit2KoWDimgObknCz++aU90MesSfvAZTeUFQHw@mail.gmail.com>]
* Re: Word-at-a-time dcache name accesses (was Re: .. anybody know of any filesystems that depend on the exact VFS 'namehash' implementation?) [not found] ` <CACvQF53YasSCUit2KoWDimgObknCz++aU90MesSfvAZTeUFQHw@mail.gmail.com> @ 2013-04-04 16:50 ` Lai Jiangshan 0 siblings, 0 replies; 26+ messages in thread From: Lai Jiangshan @ 2013-04-04 16:50 UTC (permalink / raw) To: Brian Gerst Cc: Linus Torvalds, Dave Jones, Jason Garrett-Glaser, Andi Kleen, H. Peter Anvin, Linux Kernel Mailing List, linux-fsdevel, Al Viro, dingtianhong, Karel Srot, Eric Dumazet, David S. Miller [resend in plain text mode (I did not notice the gmail changed the default mode, sorry)] On Fri, Apr 5, 2013 at 12:17 AM, Lai Jiangshan <laijs@cn.fujitsu.com> wrote: > Hi, ALL > > I also encountered the same problem. > > git bisect: > > 14134f6584212d585b310ce95428014b653dfaf6 is the first bad commit > commit 14134f6584212d585b310ce95428014b653dfaf6 > Author: dingtianhong <dingtianhong@huawei.com> > Date: Mon Mar 25 17:02:04 2013 +0000 > > af_unix: dont send SCM_CREDENTIAL when dest socket is NULL > > SCM_SCREDENTIALS should apply to write() syscalls only either source or > destination > socket asserted SOCK_PASSCRED. The original implememtation in > maybe_add_creds is wrong, > and breaks several LSB testcases ( i.e. > /tset/LSB.os/netowkr/recvfrom/T.recvfrom). > > Origionally-authored-by: Karel Srot <ksrot@redhat.com> > Signed-off-by: Ding Tianhong <dingtianhong@huawei.com> > Acked-by: Eric Dumazet <edumazet@google.com> > Signed-off-by: David S. 
Miller <davem@davemloft.net> > > :040000 040000 ef0356cc0fc168a39c0f94cff0ba27c46c4d0048 > ae34e59f235c379f04d6145f0103cccd5b3a307a M net > > =========== > Like Brian Gerst, no obvious bug, but the system can't boot, "service udev > start" fails when boot > (also DEBUG_PAGEALLOC=n, I did not try to test with it=y) > > [ 11.022976] systemd[1]: udev-control.socket failed to listen on sockets: > Address already in use > [ 11.023293] systemd[1]: Unit udev-control.socket entered failed state. > [ 11.182478] systemd-readahead-replay[399]: Bumped block_nr parameter of > 8:16 to 16384. This is a temporary hack and should be removed one day. > [ 14.473283] udevd[410]: bind failed: Address already in use > [ 14.478630] udevd[410]: error binding udev control socket > [ 15.201158] systemd[1]: udev.service: main process exited, code=exited, > status=1 > [ 16.900792] udevd[427]: error binding udev control socket > [ 18.356484] EXT4-fs (sdb7): re-mounted. Opts: (null) > [ 19.738401] systemd[1]: udev.service holdoff time over, scheduling > restart. > [ 19.742494] systemd[1]: Job pending for unit, delaying automatic restart. > [ 19.747764] systemd[1]: Unit udev.service entered failed state. > [ 19.752303] systemd[1]: udev-control.socket failed to listen on sockets: > Address already in use > [ 19.770723] udevd[459]: bind failed: Address already in use > [ 19.771027] udevd[459]: error binding udev control socket > [ 19.771175] udevd[459]: error binding udev control socket > [ 19.813256] systemd[1]: udev.service: main process exited, code=exited, > status=1 > [ 19.914450] systemd[1]: udev.service holdoff time over, scheduling > restart. > [ 19.918374] systemd[1]: Job pending for unit, delaying automatic restart. > [ 19.923392] systemd[1]: Unit udev.service entered failed state. 
> [ 19.923808] systemd[1]: udev-control.socket failed to listen on sockets: > Address already in use > [ 19.943792] udevd[465]: bind failed: Address already in use > [ 19.944056] udevd[465]: error binding udev control socket > [ 19.944210] udevd[465]: error binding udev control socket > [ 19.946071] systemd[1]: udev.service: main process exited, code=exited, > status=1 > [ 20.047524] systemd[1]: udev.service holdoff time over, scheduling > restart. > [ 20.051939] systemd[1]: Job pending for unit, delaying automatic restart. > [ 20.057539] systemd[1]: Unit udev.service entered failed state. > [ 20.058069] systemd[1]: udev-control.socket failed to listen on sockets: > Address already in use > [ 20.081141] udevd[467]: bind failed: Address already in use > [ 20.087120] udevd[467]: error binding udev control socket > [ 20.092040] udevd[467]: error binding udev control socket > [ 20.096519] systemd[1]: udev.service: main process exited, code=exited, > status=1 > [ 20.184910] systemd[1]: udev.service holdoff time over, scheduling > restart. > [ 20.189863] systemd[1]: Job pending for unit, delaying automatic restart. > [ 20.195440] systemd[1]: Unit udev.service entered failed state. > [ 20.196012] systemd[1]: udev-control.socket failed to listen on sockets: > Address already in use > [ 20.220543] udevd[469]: bind failed: Address already in use > [ 20.220584] udevd[469]: error binding udev control socket > [ 20.220780] udevd[469]: error binding udev control socket > [ 20.222830] systemd[1]: udev.service: main process exited, code=exited, > status=1 > [ 20.323906] systemd[1]: udev.service holdoff time over, scheduling > restart. > [ 20.329170] systemd[1]: Job pending for unit, delaying automatic restart. > [ 20.334785] systemd[1]: Unit udev.service entered failed state. 
> [ 20.335318] systemd[1]: udev-control.socket failed to listen on sockets: > Address already in use > [ 20.360255] udevd[471]: bind failed: Address already in use > [ 20.360294] udevd[471]: error binding udev control socket > [ 20.360401] udevd[471]: error binding udev control socket > [ 20.362359] systemd[1]: udev.service: main process exited, code=exited, > status=1 > [ 20.463651] systemd[1]: udev.service holdoff time over, scheduling > restart. > [ 20.468380] systemd[1]: Job pending for unit, delaying automatic restart. > [ 20.473938] systemd[1]: Unit udev.service entered failed state. > [ 20.474309] systemd[1]: udev-control.socket failed to listen on sockets: > Address already in use > [ 20.488704] systemd[1]: udev.service start request repeated too quickly, > refusing to start. > [ 20.504820] systemd[1]: udev-control.socket failed to listen on sockets: > Address already in use > [ 20.510091] systemd[1]: udev.service start request repeated too quickly, > refusing to start. > [ 20.511575] systemd[1]: Unit udev-kernel.socket entered failed state. > > On Wed, Mar 28, 2012 at 8:56 AM, Brian Gerst <brgerst@gmail.com> wrote: >> >> On Tue, Mar 27, 2012 at 8:39 PM, Linus Torvalds >> <torvalds@linux-foundation.org> wrote: >> > On Mon, Mar 26, 2012 at 10:31 PM, Brian Gerst <brgerst@gmail.com> wrote: >> >> >> >> Yes, I see that commit. It looks like I have a different bug that is >> >> preventing the current head from booting then. Bisecting brought me >> >> to this commit instead. >> > >> > So try first just forcing DCACHE_WORD_ACCESS off (easily done by just >> > removing the "select" line in arch/x86/Kconfig). >> > >> > If that fixes things, it really is the word access code. >> > >> > But if the problem persists and it's a different bug, try to bisect it >> > with that DCACHE_WORD_ACCESS config option always forced off, just to >> > avoid any mixing of issues, exactly because we did have that one bug >> > that already got fixed that looked somewhat similar.. 
>> > >> > I'll test the current -git kernel on a F16 machine I have, just to >> > check. But I'm also assuming that there is something specific about >> > your setup, because otherwise I'd have expected many more reports >> > about this... >> > >> > Linus >> >> Commit f132c5be05e40 did fix this particular problem for me. There is >> another bug causing boot failure (seems to be in the DRM code), but >> bisecting initially fingered this patch. It was just looking at >> kernels in between this patch and the later fix. Sorry for the >> confusion. >> >> -- >> Brian Gerst >> -- >> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in >> the body of a message to majordomo@vger.kernel.org >> More majordomo info at http://vger.kernel.org/majordomo-info.html >> Please read the FAQ at http://www.tux.org/lkml/ > > ^ permalink raw reply [flat|nested] 26+ messages in thread
end of thread, other threads:[~2013-04-04 16:50 UTC | newest] Thread overview: 26+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2012-03-02 23:46 Word-at-a-time dcache name accesses (was Re: .. anybody know of any filesystems that depend on the exact VFS 'namehash' implementation?) Linus Torvalds 2012-03-03 0:02 ` Ted Ts'o 2012-03-03 0:17 ` david 2012-03-03 0:24 ` Linus Torvalds 2012-03-04 22:19 ` Matthew Wilcox 2012-03-04 23:27 ` Linus Torvalds 2012-03-03 0:17 ` Linus Torvalds 2012-03-03 0:38 ` H. Peter Anvin 2012-03-03 0:57 ` Linus Torvalds 2012-03-03 1:02 ` H. Peter Anvin 2012-03-03 1:11 ` Linus Torvalds 2012-03-03 1:17 ` H. Peter Anvin 2012-03-03 16:12 ` Word-at-a-time dcache name accesses Andi Kleen 2012-03-03 18:47 ` H. Peter Anvin 2012-03-03 20:10 ` Word-at-a-time dcache name accesses (was Re: .. anybody know of any filesystems that depend on the exact VFS 'namehash' implementation?) Linus Torvalds 2012-03-04 2:27 ` Word-at-a-time dcache name accesses (was Re: .. anybody know ofany " Tetsuo Handa 2012-03-04 4:31 ` Andi Kleen 2012-03-05 3:58 ` Word-at-a-time dcache name accesses (was Re: .. anybody know of any " Jason Garrett-Glaser 2012-03-05 5:38 ` Linus Torvalds 2012-03-27 4:42 ` Brian Gerst 2012-03-27 5:02 ` Dave Jones 2012-03-27 5:31 ` Brian Gerst 2012-03-28 0:39 ` Linus Torvalds 2012-03-28 0:50 ` Linus Torvalds 2012-03-28 0:56 ` Brian Gerst [not found] ` <CACvQF53YasSCUit2KoWDimgObknCz++aU90MesSfvAZTeUFQHw@mail.gmail.com> 2013-04-04 16:50 ` Lai Jiangshan