* [PATCH 1/2] x86/bitops: implement __test_bit
@ 2015-08-30 8:38 Michael S. Tsirkin
2015-08-30 8:38 ` [PATCH 2/2] kvm/x86: use __test_bit Michael S. Tsirkin
2015-08-31 6:05 ` [PATCH 1/2] x86/bitops: implement __test_bit Ingo Molnar
0 siblings, 2 replies; 15+ messages in thread
From: Michael S. Tsirkin @ 2015-08-30 8:38 UTC (permalink / raw)
To: linux-kernel
Cc: Thomas Gleixner, Ingo Molnar, H. Peter Anvin, x86, Rusty Russell
One little-known side effect of test_bit is that it adds
a kind of a compiler barrier since the pointer parameter
is volatile.
It seems risky to change the semantics of test_bit, so let's just
add __test_bit (matching __set_bit and __clear_bit) that does
not add such a barrier.
Will be used by kvm on x86, where it shaves several bytes off
the binary size. Small win, but comes at no cost, so why not.
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
x86 maintainers - please specify whether you are ok with
adding this to arch/x86/include/asm/bitops.h
An alternative is to add this to kvm/x86 only.
It might be worth it to add this to all architectures,
though I haven't explored too much.
arch/x86/include/asm/bitops.h | 30 ++++++++++++++++++++++++++++++
1 file changed, 30 insertions(+)
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index cfe3b95..9229334 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -323,6 +323,24 @@ static inline int variable_test_bit(long nr, volatile const unsigned long *addr)
return oldbit;
}
+static __always_inline int __constant_test_bit(long nr, const unsigned long *addr)
+{
+ return ((1UL << (nr & (BITS_PER_LONG-1))) &
+ (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
+}
+
+static inline int __variable_test_bit(long nr, const unsigned long *addr)
+{
+ int oldbit;
+
+ asm volatile("bt %2,%1\n\t"
+ "sbb %0,%0"
+ : "=r" (oldbit)
+ : "m" (*addr), "Ir" (nr));
+
+ return oldbit;
+}
+
#if 0 /* Fool kernel-doc since it doesn't do macros yet */
/**
* test_bit - Determine whether a bit is set
@@ -330,6 +348,13 @@ static inline int variable_test_bit(long nr, volatile const unsigned long *addr)
* @addr: Address to start counting from
*/
static int test_bit(int nr, const volatile unsigned long *addr);
+
+/**
+ * __test_bit - Determine whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static int __test_bit(int nr, const volatile unsigned long *addr);
#endif
#define test_bit(nr, addr) \
@@ -337,6 +362,11 @@ static int test_bit(int nr, const volatile unsigned long *addr);
? constant_test_bit((nr), (addr)) \
: variable_test_bit((nr), (addr)))
+#define __test_bit(nr, addr) \
+ (__builtin_constant_p((nr)) \
+ ? __constant_test_bit((nr), (addr)) \
+ : __variable_test_bit((nr), (addr)))
+
/**
* __ffs - find first set bit in word
* @word: The word to search
--
MST
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 2/2] kvm/x86: use __test_bit
2015-08-30 8:38 [PATCH 1/2] x86/bitops: implement __test_bit Michael S. Tsirkin
@ 2015-08-30 8:38 ` Michael S. Tsirkin
2015-08-31 6:05 ` [PATCH 1/2] x86/bitops: implement __test_bit Ingo Molnar
1 sibling, 0 replies; 15+ messages in thread
From: Michael S. Tsirkin @ 2015-08-30 8:38 UTC (permalink / raw)
To: linux-kernel
Cc: Thomas Gleixner, Ingo Molnar, H. Peter Anvin, x86, Gleb Natapov,
Paolo Bonzini, kvm, Rusty Russell
Let compiler do slightly better optimizations using
the non-volatile __test_bit in all cases where
the values are set using the non-volatile __set_bit and
__clear_bit.
I left test_bit in place where the mask is set using
the atomic set_bit/clear_bit, for symmetry.
This shaves about 100 bytes off the kernel size:
before:
134868 2997 8372 146237 23b3d arch/x86/kvm/kvm-intel.ko
343129 47640 441 391210 5f82a arch/x86/kvm/kvm.ko
after:
134836 2997 8372 146205 23b1d arch/x86/kvm/kvm-intel.ko
343017 47640 441 391098 5f7ba arch/x86/kvm/kvm.ko
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
arch/x86/kvm/ioapic.h | 2 +-
arch/x86/kvm/kvm_cache_regs.h | 6 +++---
arch/x86/kvm/ioapic.c | 2 +-
arch/x86/kvm/pmu_intel.c | 2 +-
arch/x86/kvm/vmx.c | 18 +++++++++---------
arch/x86/kvm/x86.c | 2 +-
6 files changed, 16 insertions(+), 16 deletions(-)
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index ca0b0b4..3b58d41 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -102,7 +102,7 @@ static inline bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector)
{
struct kvm_ioapic *ioapic = kvm->arch.vioapic;
smp_rmb();
- return test_bit(vector, ioapic->handled_vectors);
+ return __test_bit(vector, ioapic->handled_vectors);
}
void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index e1e89ee..21ef6d6 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -9,7 +9,7 @@
static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
enum kvm_reg reg)
{
- if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
+ if (!__test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
kvm_x86_ops->cache_reg(vcpu, reg);
return vcpu->arch.regs[reg];
@@ -38,7 +38,7 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
{
might_sleep(); /* on svm */
- if (!test_bit(VCPU_EXREG_PDPTR,
+ if (!__test_bit(VCPU_EXREG_PDPTR,
(unsigned long *)&vcpu->arch.regs_avail))
kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
@@ -68,7 +68,7 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
{
- if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
+ if (!__test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
kvm_x86_ops->decache_cr3(vcpu);
return vcpu->arch.cr3;
}
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 856f791..bf2afa5 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -117,7 +117,7 @@ static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
return;
new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector);
- old_val = test_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map);
+ old_val = __test_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map);
if (new_val == old_val)
return;
diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c
index ab38af4..fb20a0f 100644
--- a/arch/x86/kvm/pmu_intel.c
+++ b/arch/x86/kvm/pmu_intel.c
@@ -98,7 +98,7 @@ static bool intel_pmc_is_enabled(struct kvm_pmc *pmc)
{
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
- return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
+ return __test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
}
static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c117703..ed44026 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2025,7 +2025,7 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
unsigned long rflags, save_rflags;
- if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
+ if (!__test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
__set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
rflags = vmcs_readl(GUEST_RFLAGS);
if (to_vmx(vcpu)->rmode.vm86_active) {
@@ -3478,7 +3478,7 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
- if (!test_bit(VCPU_EXREG_PDPTR,
+ if (!__test_bit(VCPU_EXREG_PDPTR,
(unsigned long *)&vcpu->arch.regs_dirty))
return;
@@ -3513,7 +3513,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
unsigned long cr0,
struct kvm_vcpu *vcpu)
{
- if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
+ if (!__test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
vmx_decache_cr3(vcpu);
if (!(cr0 & X86_CR0_PG)) {
/* From paging/starting to nonpaging */
@@ -4275,24 +4275,24 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
*/
if (msr <= 0x1fff) {
if (type & MSR_TYPE_R &&
- !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
+ !__test_bit(msr, msr_bitmap_l1 + 0x000 / f))
/* read-low */
__clear_bit(msr, msr_bitmap_nested + 0x000 / f);
if (type & MSR_TYPE_W &&
- !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
+ !__test_bit(msr, msr_bitmap_l1 + 0x800 / f))
/* write-low */
__clear_bit(msr, msr_bitmap_nested + 0x800 / f);
} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
msr &= 0x1fff;
if (type & MSR_TYPE_R &&
- !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
+ !__test_bit(msr, msr_bitmap_l1 + 0x400 / f))
/* read-high */
__clear_bit(msr, msr_bitmap_nested + 0x400 / f);
if (type & MSR_TYPE_W &&
- !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
+ !__test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
/* write-high */
__clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
@@ -8316,9 +8316,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx->nested.sync_shadow_vmcs = false;
}
- if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
+ if (__test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
- if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
+ if (__test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
cr4 = cr4_read_shadow();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5ef2560..c8015fa 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -555,7 +555,7 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
if (is_long_mode(vcpu) || !is_pae(vcpu))
return false;
- if (!test_bit(VCPU_EXREG_PDPTR,
+ if (!__test_bit(VCPU_EXREG_PDPTR,
(unsigned long *)&vcpu->arch.regs_avail))
return true;
--
MST
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH 1/2] x86/bitops: implement __test_bit
2015-08-30 8:38 [PATCH 1/2] x86/bitops: implement __test_bit Michael S. Tsirkin
2015-08-30 8:38 ` [PATCH 2/2] kvm/x86: use __test_bit Michael S. Tsirkin
@ 2015-08-31 6:05 ` Ingo Molnar
2015-08-31 6:13 ` H. Peter Anvin
1 sibling, 1 reply; 15+ messages in thread
From: Ingo Molnar @ 2015-08-31 6:05 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: linux-kernel, Thomas Gleixner, Ingo Molnar, H. Peter Anvin, x86,
Rusty Russell, Linus Torvalds
* Michael S. Tsirkin <mst@redhat.com> wrote:
> +static __always_inline int __constant_test_bit(long nr, const unsigned long *addr)
> +{
> + return ((1UL << (nr & (BITS_PER_LONG-1))) &
> + (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
> +}
> +
> +static inline int __variable_test_bit(long nr, const unsigned long *addr)
> +{
> + int oldbit;
> +
> + asm volatile("bt %2,%1\n\t"
> + "sbb %0,%0"
> + : "=r" (oldbit)
> + : "m" (*addr), "Ir" (nr));
> +
> + return oldbit;
> +}
Color me confused, why use assembly for this at all?
Why not just use C for testing the bit (i.e. turn __constant_test_bit() into
__test_bit()) - that would also allow the compiler to propagate the result,
potentially more optimally than we can do it via SBB...
Thanks,
Ingo
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH 1/2] x86/bitops: implement __test_bit
2015-08-31 6:05 ` [PATCH 1/2] x86/bitops: implement __test_bit Ingo Molnar
@ 2015-08-31 6:13 ` H. Peter Anvin
2015-08-31 7:56 ` Michael S. Tsirkin
0 siblings, 1 reply; 15+ messages in thread
From: H. Peter Anvin @ 2015-08-31 6:13 UTC (permalink / raw)
To: Ingo Molnar, Michael S. Tsirkin
Cc: linux-kernel, Thomas Gleixner, Ingo Molnar, x86, Rusty Russell,
Linus Torvalds
Presumably because gcc can't generate bt... whether or not it is worth it is another matter.
On August 30, 2015 11:05:49 PM PDT, Ingo Molnar <mingo@kernel.org> wrote:
>
>* Michael S. Tsirkin <mst@redhat.com> wrote:
>
>> +static __always_inline int __constant_test_bit(long nr, const
>unsigned long *addr)
>> +{
>> + return ((1UL << (nr & (BITS_PER_LONG-1))) &
>> + (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
>> +}
>> +
>> +static inline int __variable_test_bit(long nr, const unsigned long
>*addr)
>> +{
>> + int oldbit;
>> +
>> + asm volatile("bt %2,%1\n\t"
>> + "sbb %0,%0"
>> + : "=r" (oldbit)
>> + : "m" (*addr), "Ir" (nr));
>> +
>> + return oldbit;
>> +}
>
>Color me confused, why use assembly for this at all?
>
>Why not just use C for testing the bit (i.e. turn __constant_test_bit()
>into
>__test_bit()) - that would also allow the compiler to propagate the
>result,
>potentially more optimally than we can do it via SBB...
>
>Thanks,
>
> Ingo
--
Sent from my Android device with K-9 Mail. Please excuse my brevity.
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH 1/2] x86/bitops: implement __test_bit
2015-08-31 6:13 ` H. Peter Anvin
@ 2015-08-31 7:56 ` Michael S. Tsirkin
2015-08-31 7:59 ` Ingo Molnar
0 siblings, 1 reply; 15+ messages in thread
From: Michael S. Tsirkin @ 2015-08-31 7:56 UTC (permalink / raw)
To: H. Peter Anvin
Cc: Ingo Molnar, linux-kernel, Thomas Gleixner, Ingo Molnar, x86,
Rusty Russell, Linus Torvalds
On Sun, Aug 30, 2015 at 11:13:20PM -0700, H. Peter Anvin wrote:
> Presumably because gcc can't generate bt... whether or not it is worth it is another matter.
>
> On August 30, 2015 11:05:49 PM PDT, Ingo Molnar <mingo@kernel.org> wrote:
> >
> >* Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> >> +static __always_inline int __constant_test_bit(long nr, const
> >unsigned long *addr)
> >> +{
> >> + return ((1UL << (nr & (BITS_PER_LONG-1))) &
> >> + (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
> >> +}
> >> +
> >> +static inline int __variable_test_bit(long nr, const unsigned long
> >*addr)
> >> +{
> >> + int oldbit;
> >> +
> >> + asm volatile("bt %2,%1\n\t"
> >> + "sbb %0,%0"
> >> + : "=r" (oldbit)
> >> + : "m" (*addr), "Ir" (nr));
> >> +
> >> + return oldbit;
> >> +}
> >
> >Color me confused, why use assembly for this at all?
> >
> >Why not just use C for testing the bit (i.e. turn __constant_test_bit()
> >into
> >__test_bit()) - that would also allow the compiler to propagate the
> >result,
> >potentially more optimally than we can do it via SBB...
> >
> >Thanks,
> >
> > Ingo
Exactly:
Disassembly of section .text:
00000000 <__variable_test_bit>:
__variable_test_bit():
0: 8b 54 24 08 mov 0x8(%esp),%edx
4: 8b 44 24 04 mov 0x4(%esp),%eax
8: 0f a3 02 bt %eax,(%edx)
b: 19 c0 sbb %eax,%eax
d: c3 ret
e: 66 90 xchg %ax,%ax
00000010 <__constant_test_bit>:
__constant_test_bit():
10: 8b 4c 24 04 mov 0x4(%esp),%ecx
14: 8b 44 24 08 mov 0x8(%esp),%eax
18: 89 ca mov %ecx,%edx
1a: c1 fa 04 sar $0x4,%edx
1d: 8b 04 90 mov (%eax,%edx,4),%eax
20: d3 e8 shr %cl,%eax
22: 83 e0 01 and $0x1,%eax
25: c3 ret
That's also probably why we still have variable_test_bit
for test_bit too. It's best to be consistent with that - do you agree?
Or would you rather drop that too?
--
MST
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH 1/2] x86/bitops: implement __test_bit
2015-08-31 7:56 ` Michael S. Tsirkin
@ 2015-08-31 7:59 ` Ingo Molnar
2015-08-31 8:15 ` yalin wang
` (2 more replies)
0 siblings, 3 replies; 15+ messages in thread
From: Ingo Molnar @ 2015-08-31 7:59 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: H. Peter Anvin, linux-kernel, Thomas Gleixner, Ingo Molnar, x86,
Rusty Russell, Linus Torvalds
* Michael S. Tsirkin <mst@redhat.com> wrote:
> On Sun, Aug 30, 2015 at 11:13:20PM -0700, H. Peter Anvin wrote:
> > Presumably because gcc can't generate bt... whether or not it is worth it is another matter.
> >
> > On August 30, 2015 11:05:49 PM PDT, Ingo Molnar <mingo@kernel.org> wrote:
> > >
> > >* Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > >> +static __always_inline int __constant_test_bit(long nr, const
> > >unsigned long *addr)
> > >> +{
> > >> + return ((1UL << (nr & (BITS_PER_LONG-1))) &
> > >> + (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
> > >> +}
> > >> +
> > >> +static inline int __variable_test_bit(long nr, const unsigned long
> > >*addr)
> > >> +{
> > >> + int oldbit;
> > >> +
> > >> + asm volatile("bt %2,%1\n\t"
> > >> + "sbb %0,%0"
> > >> + : "=r" (oldbit)
> > >> + : "m" (*addr), "Ir" (nr));
> > >> +
> > >> + return oldbit;
> > >> +}
> > >
> > >Color me confused, why use assembly for this at all?
> > >
> > >Why not just use C for testing the bit (i.e. turn __constant_test_bit()
> > >into
> > >__test_bit()) - that would also allow the compiler to propagate the
> > >result,
> > >potentially more optimally than we can do it via SBB...
> > >
> > >Thanks,
> > >
> > > Ingo
>
> Exactly:
>
>
> Disassembly of section .text:
>
> 00000000 <__variable_test_bit>:
> __variable_test_bit():
> 0: 8b 54 24 08 mov 0x8(%esp),%edx
> 4: 8b 44 24 04 mov 0x4(%esp),%eax
> 8: 0f a3 02 bt %eax,(%edx)
> b: 19 c0 sbb %eax,%eax
> d: c3 ret
> e: 66 90 xchg %ax,%ax
>
> 00000010 <__constant_test_bit>:
> __constant_test_bit():
> 10: 8b 4c 24 04 mov 0x4(%esp),%ecx
> 14: 8b 44 24 08 mov 0x8(%esp),%eax
> 18: 89 ca mov %ecx,%edx
> 1a: c1 fa 04 sar $0x4,%edx
> 1d: 8b 04 90 mov (%eax,%edx,4),%eax
> 20: d3 e8 shr %cl,%eax
> 22: 83 e0 01 and $0x1,%eax
> 25: c3 ret
But that's due to the forced interface of generating a return code. Please compare
it at an inlined usage site, where GCC is free to do the comparison directly and
use the result in flags.
Thanks,
Ingo
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH 1/2] x86/bitops: implement __test_bit
2015-08-31 7:59 ` Ingo Molnar
@ 2015-08-31 8:15 ` yalin wang
2015-08-31 8:19 ` Ingo Molnar
2015-08-31 8:15 ` Ingo Molnar
2015-08-31 11:19 ` Michael S. Tsirkin
2 siblings, 1 reply; 15+ messages in thread
From: yalin wang @ 2015-08-31 8:15 UTC (permalink / raw)
To: Ingo Molnar
Cc: Michael S. Tsirkin, H. Peter Anvin, lkml, Thomas Gleixner,
Ingo Molnar, x86, Rusty Russell, Linus Torvalds
> On Aug 31, 2015, at 15:59, Ingo Molnar <mingo@kernel.org> wrote:
>
>
> * Michael S. Tsirkin <mst@redhat.com> wrote:
>
>> On Sun, Aug 30, 2015 at 11:13:20PM -0700, H. Peter Anvin wrote:
>>> Presumably because gcc can't generate bt... whether or not it is worth it is another matter.
>>>
>>> On August 30, 2015 11:05:49 PM PDT, Ingo Molnar <mingo@kernel.org> wrote:
>>>>
>>>> * Michael S. Tsirkin <mst@redhat.com> wrote:
>>>>
>>>>> +static __always_inline int __constant_test_bit(long nr, const
>>>> unsigned long *addr)
>>>>> +{
>>>>> + return ((1UL << (nr & (BITS_PER_LONG-1))) &
>>>>> + (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
>>>>> +}
>>>>> +
>>>>> +static inline int __variable_test_bit(long nr, const unsigned long
>>>> *addr)
>>>>> +{
>>>>> + int oldbit;
>>>>> +
>>>>> + asm volatile("bt %2,%1\n\t"
>>>>> + "sbb %0,%0"
>>>>> + : "=r" (oldbit)
>>>>> + : "m" (*addr), "Ir" (nr));
>>>>> +
>>>>> + return oldbit;
>>>>> +}
>>>>
>>>> Color me confused, why use assembly for this at all?
>>>>
>>>> Why not just use C for testing the bit (i.e. turn __constant_test_bit()
>>>> into
>>>> __test_bit()) - that would also allow the compiler to propagate the
>>>> result,
>>>> potentially more optimally than we can do it via SBB...
>>>>
>>>> Thanks,
>>>>
>>>> Ingo
>>
>> Exactly:
>>
>>
>> Disassembly of section .text:
>>
>> 00000000 <__variable_test_bit>:
>> __variable_test_bit():
>> 0: 8b 54 24 08 mov 0x8(%esp),%edx
>> 4: 8b 44 24 04 mov 0x4(%esp),%eax
>> 8: 0f a3 02 bt %eax,(%edx)
>> b: 19 c0 sbb %eax,%eax
>> d: c3 ret
>> e: 66 90 xchg %ax,%ax
>>
>> 00000010 <__constant_test_bit>:
>> __constant_test_bit():
>> 10: 8b 4c 24 04 mov 0x4(%esp),%ecx
>> 14: 8b 44 24 08 mov 0x8(%esp),%eax
>> 18: 89 ca mov %ecx,%edx
>> 1a: c1 fa 04 sar $0x4,%edx
>> 1d: 8b 04 90 mov (%eax,%edx,4),%eax
>> 20: d3 e8 shr %cl,%eax
>> 22: 83 e0 01 and $0x1,%eax
>> 25: c3 ret
>
> But that's due to the forced interface of generating a return code. Please compare
> it at an inlined usage site, where GCC is free to do the comparison directly and
> use the result in flags.
just curious :
it seems __variable_test_bit() uses fewer instructions,
so why not always use __variable_test_bit() and remove the __constant_test_bit() version?
Thanks
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH 1/2] x86/bitops: implement __test_bit
2015-08-31 7:59 ` Ingo Molnar
2015-08-31 8:15 ` yalin wang
@ 2015-08-31 8:15 ` Ingo Molnar
2015-08-31 11:19 ` Michael S. Tsirkin
2 siblings, 0 replies; 15+ messages in thread
From: Ingo Molnar @ 2015-08-31 8:15 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: H. Peter Anvin, linux-kernel, Thomas Gleixner, Ingo Molnar, x86,
Rusty Russell, Linus Torvalds
* Ingo Molnar <mingo@kernel.org> wrote:
> > Disassembly of section .text:
> >
> > 00000000 <__variable_test_bit>:
> > __variable_test_bit():
> > 0: 8b 54 24 08 mov 0x8(%esp),%edx
> > 4: 8b 44 24 04 mov 0x4(%esp),%eax
> > 8: 0f a3 02 bt %eax,(%edx)
> > b: 19 c0 sbb %eax,%eax
> > d: c3 ret
> > e: 66 90 xchg %ax,%ax
> >
> > 00000010 <__constant_test_bit>:
> > __constant_test_bit():
> > 10: 8b 4c 24 04 mov 0x4(%esp),%ecx
> > 14: 8b 44 24 08 mov 0x8(%esp),%eax
> > 18: 89 ca mov %ecx,%edx
> > 1a: c1 fa 04 sar $0x4,%edx
> > 1d: 8b 04 90 mov (%eax,%edx,4),%eax
> > 20: d3 e8 shr %cl,%eax
> > 22: 83 e0 01 and $0x1,%eax
> > 25: c3 ret
>
> But that's due to the forced interface of generating a return code. Please
> compare it at an inlined usage site, where GCC is free to do the comparison
> directly and use the result in flags.
So I was thinking about the patch below on top of yours.
But turns out GCC indeed generates worse code even under the best of
circumstances. For example the nested_vmx_disable_intercept_for_msr() change:
@@ -4275,24 +4275,24 @@ static void nested_vmx_disable_intercept
*/
if (msr <= 0x1fff) {
if (type & MSR_TYPE_R &&
- !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
+ !__test_bit(msr, msr_bitmap_l1 + 0x000 / f))
/* read-low */
__clear_bit(msr, msr_bitmap_nested + 0x000 / f);
before (i.e. your series):
ffffffff818b1082: 89 d0 mov %edx,%eax
ffffffff818b1084: 48 0f a3 07 bt %rax,(%rdi)
ffffffff818b1088: 45 19 c0 sbb %r8d,%r8d
ffffffff818b108b: 45 85 c0 test %r8d,%r8d
ffffffff818b108e: 75 04 jne ffffffff818b1094 <nested_vmx_disable_intercept_for_msr+0x43>
after (with my 'optimization' patch applied):
ffffffff818b1091: 89 d0 mov %edx,%eax
ffffffff818b1093: 49 89 c0 mov %rax,%r8
ffffffff818b1096: 49 c1 f8 06 sar $0x6,%r8
ffffffff818b109a: 4e 8b 04 c7 mov (%rdi,%r8,8),%r8
ffffffff818b109e: 49 0f a3 d0 bt %rdx,%r8
ffffffff818b10a2: 72 04 jb ffffffff818b10a8 <nested_vmx_disable_intercept_for_msr+0x48>
So GCC when left to its own devices, generates one more instruction and 4 more
bytes. Why does GCC do that? Why doesn't it use BT directly and use the flag, i.e.
something like [pseudocode]:
ffffffff818b1082: 89 d0 mov %edx,%eax
ffffffff818b1084: 48 0f a3 07 bt %rax,(%rdi)
ffffffff818b108e: 75 04 jne ffffffff818b1094 <nested_vmx_disable_intercept_for_msr+0x43>
?
In any case I take back my objection:
Acked-by: Ingo Molnar <mingo@kernel.org>
Thanks,
Ingo
---
arch/x86/include/asm/bitops.h | 19 +------------------
1 file changed, 1 insertion(+), 18 deletions(-)
Index: tip/arch/x86/include/asm/bitops.h
===================================================================
--- tip.orig/arch/x86/include/asm/bitops.h
+++ tip/arch/x86/include/asm/bitops.h
@@ -323,24 +323,12 @@ static inline int variable_test_bit(long
return oldbit;
}
-static __always_inline int __constant_test_bit(long nr, const unsigned long *addr)
+static __always_inline int __test_bit(long nr, const unsigned long *addr)
{
return ((1UL << (nr & (BITS_PER_LONG-1))) &
(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
}
-static inline int __variable_test_bit(long nr, const unsigned long *addr)
-{
- int oldbit;
-
- asm volatile("bt %2,%1\n\t"
- "sbb %0,%0"
- : "=r" (oldbit)
- : "m" (*addr), "Ir" (nr));
-
- return oldbit;
-}
-
#if 0 /* Fool kernel-doc since it doesn't do macros yet */
/**
* test_bit - Determine whether a bit is set
@@ -362,11 +350,6 @@ static int __test_bit(int nr, const vola
? constant_test_bit((nr), (addr)) \
: variable_test_bit((nr), (addr)))
-#define __test_bit(nr, addr) \
- (__builtin_constant_p((nr)) \
- ? __constant_test_bit((nr), (addr)) \
- : __variable_test_bit((nr), (addr)))
-
/**
* __ffs - find first set bit in word
* @word: The word to search
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH 1/2] x86/bitops: implement __test_bit
2015-08-31 8:15 ` yalin wang
@ 2015-08-31 8:19 ` Ingo Molnar
0 siblings, 0 replies; 15+ messages in thread
From: Ingo Molnar @ 2015-08-31 8:19 UTC (permalink / raw)
To: yalin wang
Cc: Michael S. Tsirkin, H. Peter Anvin, lkml, Thomas Gleixner,
Ingo Molnar, x86, Rusty Russell, Linus Torvalds
* yalin wang <yalin.wang2010@gmail.com> wrote:
>
> > On Aug 31, 2015, at 15:59, Ingo Molnar <mingo@kernel.org> wrote:
> >
> >
> > * Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> >> On Sun, Aug 30, 2015 at 11:13:20PM -0700, H. Peter Anvin wrote:
> >>> Presumably because gcc can't generate bt... whether or not it is worth it is another matter.
> >>>
> >>> On August 30, 2015 11:05:49 PM PDT, Ingo Molnar <mingo@kernel.org> wrote:
> >>>>
> >>>> * Michael S. Tsirkin <mst@redhat.com> wrote:
> >>>>
> >>>>> +static __always_inline int __constant_test_bit(long nr, const
> >>>> unsigned long *addr)
> >>>>> +{
> >>>>> + return ((1UL << (nr & (BITS_PER_LONG-1))) &
> >>>>> + (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
> >>>>> +}
> >>>>> +
> >>>>> +static inline int __variable_test_bit(long nr, const unsigned long
> >>>> *addr)
> >>>>> +{
> >>>>> + int oldbit;
> >>>>> +
> >>>>> + asm volatile("bt %2,%1\n\t"
> >>>>> + "sbb %0,%0"
> >>>>> + : "=r" (oldbit)
> >>>>> + : "m" (*addr), "Ir" (nr));
> >>>>> +
> >>>>> + return oldbit;
> >>>>> +}
> >>>>
> >>>> Color me confused, why use assembly for this at all?
> >>>>
> >>>> Why not just use C for testing the bit (i.e. turn __constant_test_bit()
> >>>> into
> >>>> __test_bit()) - that would also allow the compiler to propagate the
> >>>> result,
> >>>> potentially more optimally than we can do it via SBB...
> >>>>
> >>>> Thanks,
> >>>>
> >>>> Ingo
> >>
> >> Exactly:
> >>
> >>
> >> Disassembly of section .text:
> >>
> >> 00000000 <__variable_test_bit>:
> >> __variable_test_bit():
> >> 0: 8b 54 24 08 mov 0x8(%esp),%edx
> >> 4: 8b 44 24 04 mov 0x4(%esp),%eax
> >> 8: 0f a3 02 bt %eax,(%edx)
> >> b: 19 c0 sbb %eax,%eax
> >> d: c3 ret
> >> e: 66 90 xchg %ax,%ax
> >>
> >> 00000010 <__constant_test_bit>:
> >> __constant_test_bit():
> >> 10: 8b 4c 24 04 mov 0x4(%esp),%ecx
> >> 14: 8b 44 24 08 mov 0x8(%esp),%eax
> >> 18: 89 ca mov %ecx,%edx
> >> 1a: c1 fa 04 sar $0x4,%edx
> >> 1d: 8b 04 90 mov (%eax,%edx,4),%eax
> >> 20: d3 e8 shr %cl,%eax
> >> 22: 83 e0 01 and $0x1,%eax
> >> 25: c3 ret
> >
> > But that's due to the forced interface of generating a return code. Please compare
> > it at an inlined usage site, where GCC is free to do the comparison directly and
> > use the result in flags.
> just curious :
> it seems __variable_test_bit() use less instructions,
> why not always use __variable_test_bit() , remove __constant_test_bit() version ?
It's an artifact of testing it in isolation.
For constant bit positions GCC is able to do a fairly good job:
ffffffff8103d2a0 <vmx_get_rflags>:
ffffffff8103d2a0: f6 87 4a 02 00 00 08 testb $0x8,0x24a(%rdi)
...
ffffffff8103d2ab: 75 39 jne ffffffff8103d2e6 <vmx_get_rflags+0x46>
with just 2 instructions: a TESTB plus using the flag result in a JNE.
Using variable_test_bit() forces the result into a register, which is suboptimal.
Thanks,
Ingo
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH 1/2] x86/bitops: implement __test_bit
2015-08-31 7:59 ` Ingo Molnar
2015-08-31 8:15 ` yalin wang
2015-08-31 8:15 ` Ingo Molnar
@ 2015-08-31 11:19 ` Michael S. Tsirkin
2015-09-01 9:24 ` Ingo Molnar
2 siblings, 1 reply; 15+ messages in thread
From: Michael S. Tsirkin @ 2015-08-31 11:19 UTC (permalink / raw)
To: Ingo Molnar
Cc: H. Peter Anvin, linux-kernel, Thomas Gleixner, Ingo Molnar, x86,
Rusty Russell, Linus Torvalds
On Mon, Aug 31, 2015 at 09:59:47AM +0200, Ingo Molnar wrote:
>
> * Michael S. Tsirkin <mst@redhat.com> wrote:
>
> > On Sun, Aug 30, 2015 at 11:13:20PM -0700, H. Peter Anvin wrote:
> > > Presumably because gcc can't generate bt... whether or not it is worth it is another matter.
> > >
> > > On August 30, 2015 11:05:49 PM PDT, Ingo Molnar <mingo@kernel.org> wrote:
> > > >
> > > >* Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > >> +static __always_inline int __constant_test_bit(long nr, const
> > > >unsigned long *addr)
> > > >> +{
> > > >> + return ((1UL << (nr & (BITS_PER_LONG-1))) &
> > > >> + (addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
> > > >> +}
> > > >> +
> > > >> +static inline int __variable_test_bit(long nr, const unsigned long
> > > >*addr)
> > > >> +{
> > > >> + int oldbit;
> > > >> +
> > > >> + asm volatile("bt %2,%1\n\t"
> > > >> + "sbb %0,%0"
> > > >> + : "=r" (oldbit)
> > > >> + : "m" (*addr), "Ir" (nr));
> > > >> +
> > > >> + return oldbit;
> > > >> +}
> > > >
> > > >Color me confused, why use assembly for this at all?
> > > >
> > > >Why not just use C for testing the bit (i.e. turn __constant_test_bit()
> > > >into
> > > >__test_bit()) - that would also allow the compiler to propagate the
> > > >result,
> > > >potentially more optimally than we can do it via SBB...
> > > >
> > > >Thanks,
> > > >
> > > > Ingo
> >
> > Exactly:
> >
> >
> > Disassembly of section .text:
> >
> > 00000000 <__variable_test_bit>:
> > __variable_test_bit():
> > 0: 8b 54 24 08 mov 0x8(%esp),%edx
> > 4: 8b 44 24 04 mov 0x4(%esp),%eax
> > 8: 0f a3 02 bt %eax,(%edx)
> > b: 19 c0 sbb %eax,%eax
> > d: c3 ret
> > e: 66 90 xchg %ax,%ax
> >
> > 00000010 <__constant_test_bit>:
> > __constant_test_bit():
> > 10: 8b 4c 24 04 mov 0x4(%esp),%ecx
> > 14: 8b 44 24 08 mov 0x8(%esp),%eax
> > 18: 89 ca mov %ecx,%edx
> > 1a: c1 fa 04 sar $0x4,%edx
> > 1d: 8b 04 90 mov (%eax,%edx,4),%eax
> > 20: d3 e8 shr %cl,%eax
> > 22: 83 e0 01 and $0x1,%eax
> > 25: c3 ret
>
> But that's due to the forced interface of generating a return code. Please compare
> it at an inlined usage site, where GCC is free to do the comparison directly and
> use the result in flags.
>
> Thanks,
>
> Ingo
I applied this patch on top of mine:
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 9229334..2aed985 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -323,24 +323,17 @@ static inline int variable_test_bit(long nr, volatile const unsigned long *addr)
return oldbit;
}
-static __always_inline int __constant_test_bit(long nr, const unsigned long *addr)
+/**
+ * __test_bit - Determine whether a bit is set
+ * @nr: bit number to test
+ * @addr: Address to start counting from
+ */
+static __always_inline int __test_bit(long nr, const unsigned long *addr)
{
return ((1UL << (nr & (BITS_PER_LONG-1))) &
(addr[nr >> _BITOPS_LONG_SHIFT])) != 0;
}
-static inline int __variable_test_bit(long nr, const unsigned long *addr)
-{
- int oldbit;
-
- asm volatile("bt %2,%1\n\t"
- "sbb %0,%0"
- : "=r" (oldbit)
- : "m" (*addr), "Ir" (nr));
-
- return oldbit;
-}
-
#if 0 /* Fool kernel-doc since it doesn't do macros yet */
/**
* test_bit - Determine whether a bit is set
@@ -348,13 +341,6 @@ static inline int __variable_test_bit(long nr, const unsigned long *addr)
* @addr: Address to start counting from
*/
static int test_bit(int nr, const volatile unsigned long *addr);
-
-/**
- * __test_bit - Determine whether a bit is set
- * @nr: bit number to test
- * @addr: Address to start counting from
- */
-static int __test_bit(int nr, const volatile unsigned long *addr);
#endif
#define test_bit(nr, addr) \
@@ -362,10 +348,6 @@ static int __test_bit(int nr, const volatile unsigned long *addr);
? constant_test_bit((nr), (addr)) \
: variable_test_bit((nr), (addr)))
-#define __test_bit(nr, addr) \
- (__builtin_constant_p((nr)) \
- ? __constant_test_bit((nr), (addr)) \
- : __variable_test_bit((nr), (addr)))
/**
* __ffs - find first set bit in word
And the code size went up:
134836 2997 8372 146205 23b1d arch/x86/kvm/kvm-intel.ko ->
134846 2997 8372 146215 23b27 arch/x86/kvm/kvm-intel.ko
342690 47640 441 390771 5f673 arch/x86/kvm/kvm.ko ->
342738 47640 441 390819 5f6a3 arch/x86/kvm/kvm.ko
I tried removing __always_inline, this had no effect.
--
MST
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH 1/2] x86/bitops: implement __test_bit
2015-08-31 11:19 ` Michael S. Tsirkin
@ 2015-09-01 9:24 ` Ingo Molnar
2015-09-01 9:40 ` Michael S. Tsirkin
0 siblings, 1 reply; 15+ messages in thread
From: Ingo Molnar @ 2015-09-01 9:24 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: H. Peter Anvin, linux-kernel, Thomas Gleixner, Ingo Molnar, x86,
Rusty Russell, Linus Torvalds, Uros Bizjak
* Michael S. Tsirkin <mst@redhat.com> wrote:
> I applied this patch on top of mine:
Yeah, looks similar to the one I sent.
> -static inline int __variable_test_bit(long nr, const unsigned long *addr)
> -{
> - int oldbit;
> -
> - asm volatile("bt %2,%1\n\t"
> - "sbb %0,%0"
> - : "=r" (oldbit)
> - : "m" (*addr), "Ir" (nr));
> -
> - return oldbit;
> -}
> And the code size went up:
>
> 134836 2997 8372 146205 23b1d arch/x86/kvm/kvm-intel.ko ->
> 134846 2997 8372 146215 23b27 arch/x86/kvm/kvm-intel.ko
>
> 342690 47640 441 390771 5f673 arch/x86/kvm/kvm.ko ->
> 342738 47640 441 390819 5f6a3 arch/x86/kvm/kvm.ko
>
> I tried removing __always_inline, this had no effect.
But code size isn't the only factor.
Uros Bizjak pointed out that the reason GCC does not use the "BT reg,mem"
instruction is that it's highly suboptimal even on recent microarchitectures,
Sandy Bridge is listed as having a 10 cycles latency (!) for this instruction:
http://www.agner.org/optimize/instruction_tables.pdf
this instruction had bad latency going back to Pentium 4 CPUs.
... so unless something changed in this area with Skylake I think using the
__variable_test_bit() code of the kernel is a bad choice and looking at kernel
size only is misleading.
It makes sense for atomics, but not for unlocked access.
Thanks,
Ingo
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH 1/2] x86/bitops: implement __test_bit
2015-09-01 9:24 ` Ingo Molnar
@ 2015-09-01 9:40 ` Michael S. Tsirkin
2015-09-01 11:39 ` Ingo Molnar
0 siblings, 1 reply; 15+ messages in thread
From: Michael S. Tsirkin @ 2015-09-01 9:40 UTC (permalink / raw)
To: Ingo Molnar
Cc: H. Peter Anvin, linux-kernel, Thomas Gleixner, Ingo Molnar, x86,
Rusty Russell, Linus Torvalds, Uros Bizjak
On Tue, Sep 01, 2015 at 11:24:22AM +0200, Ingo Molnar wrote:
>
> * Michael S. Tsirkin <mst@redhat.com> wrote:
>
> > I applied this patch on top of mine:
>
> Yeah, looks similar to the one I sent.
>
> > -static inline int __variable_test_bit(long nr, const unsigned long *addr)
> > -{
> > - int oldbit;
> > -
> > - asm volatile("bt %2,%1\n\t"
> > - "sbb %0,%0"
> > - : "=r" (oldbit)
> > - : "m" (*addr), "Ir" (nr));
> > -
> > - return oldbit;
> > -}
>
> > And the code size went up:
> >
> > 134836 2997 8372 146205 23b1d arch/x86/kvm/kvm-intel.ko ->
> > 134846 2997 8372 146215 23b27 arch/x86/kvm/kvm-intel.ko
> >
> > 342690 47640 441 390771 5f673 arch/x86/kvm/kvm.ko ->
> > 342738 47640 441 390819 5f6a3 arch/x86/kvm/kvm.ko
> >
> > I tried removing __always_inline, this had no effect.
>
> But code size isn't the only factor.
>
> Uros Bizjak pointed out that the reason GCC does not use the "BT reg,mem"
> instruction is that it's highly suboptimal even on recent microarchitectures,
> Sandy Bridge is listed as having a 10 cycles latency (!) for this instruction:
>
> http://www.agner.org/optimize/instruction_tables.pdf
>
> this instruction had bad latency going back to Pentium 4 CPUs.
>
> ... so unless something changed in this area with Skylake I think using the
> __variable_test_bit() code of the kernel is a bad choice and looking at kernel
> size only is misleading.
>
> It makes sense for atomics, but not for unlocked access.
>
> Thanks,
>
> Ingo
Hmm - so do you take back the ack?
I compared this:
int main(int argc, char **argv)
{
long long int i;
const unsigned long addr = 0;
for (i = 0; i < 1000000000ull; ++i) {
asm volatile("");
if (__variable_test_bit(1, &addr))
asm volatile("");
}
return 0;
}
with the __constant_test_bit variant.
__constant_test_bit code does appear to be slower on an i7 processor.
test_bit isn't atomic either. Maybe drop variable_test_bit there too?
--
MST
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH 1/2] x86/bitops: implement __test_bit
2015-09-01 9:40 ` Michael S. Tsirkin
@ 2015-09-01 11:39 ` Ingo Molnar
2015-09-01 15:03 ` Michael S. Tsirkin
0 siblings, 1 reply; 15+ messages in thread
From: Ingo Molnar @ 2015-09-01 11:39 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: H. Peter Anvin, linux-kernel, Thomas Gleixner, Ingo Molnar, x86,
Rusty Russell, Linus Torvalds, Uros Bizjak
* Michael S. Tsirkin <mst@redhat.com> wrote:
> On Tue, Sep 01, 2015 at 11:24:22AM +0200, Ingo Molnar wrote:
> >
> > * Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > > I applied this patch on top of mine:
> >
> > Yeah, looks similar to the one I sent.
> >
> > > -static inline int __variable_test_bit(long nr, const unsigned long *addr)
> > > -{
> > > - int oldbit;
> > > -
> > > - asm volatile("bt %2,%1\n\t"
> > > - "sbb %0,%0"
> > > - : "=r" (oldbit)
> > > - : "m" (*addr), "Ir" (nr));
> > > -
> > > - return oldbit;
> > > -}
> >
> > > And the code size went up:
> > >
> > > 134836 2997 8372 146205 23b1d arch/x86/kvm/kvm-intel.ko ->
> > > 134846 2997 8372 146215 23b27 arch/x86/kvm/kvm-intel.ko
> > >
> > > 342690 47640 441 390771 5f673 arch/x86/kvm/kvm.ko ->
> > > 342738 47640 441 390819 5f6a3 arch/x86/kvm/kvm.ko
> > >
> > > I tried removing __always_inline, this had no effect.
> >
> > But code size isn't the only factor.
> >
> > Uros Bizjak pointed out that the reason GCC does not use the "BT reg,mem"
> > instruction is that it's highly suboptimal even on recent microarchitectures,
> > Sandy Bridge is listed as having a 10 cycles latency (!) for this instruction:
> >
> > http://www.agner.org/optimize/instruction_tables.pdf
> >
> > this instruction had bad latency going back to Pentium 4 CPUs.
> >
> > ... so unless something changed in this area with Skylake I think using the
> > __variable_test_bit() code of the kernel is a bad choice and looking at kernel
> > size only is misleading.
> >
> > It makes sense for atomics, but not for unlocked access.
> >
> > Thanks,
> >
> > Ingo
>
> Hmm - so do you take back the ack?
I have no strong feelings either way, it simply strikes me as misguided to
explicitly optimize for something that is listed as a high overhead instruction.
Assuming it really is high overhead:
> I compared this:
> int main(int argc, char **argv)
> {
>
> long long int i;
> const unsigned long addr = 0;
> for (i = 0; i < 1000000000ull; ++i) {
> asm volatile("");
> if (__variable_test_bit(1, &addr))
> asm volatile("");
> }
> return 0;
> }
>
> with the __constant_test_bit variant.
>
> __constant_test_bit code does appear to be slower on an i7 processor.
Hm, so this seems to be contradictory: if I'm right with the argument above then
we'd expect the opposite result: variable_test_bit (BT using asm variant) should
be slower than constant_test_bit (GCC version), correct?
Btw., to be sure it's a representative performance test instead of a barrier() in
your testcase I'd actually do something with the result in a way neither the
compiler nor the CPU can optimize it out as unused.
> test_bit isn't atomic either. Maybe drop variable_test_bit there too?
Yes, but only if I'm right about BT being suboptimal in this case on modern x86
CPUs!
Thanks,
Ingo
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH 1/2] x86/bitops: implement __test_bit
2015-09-01 11:39 ` Ingo Molnar
@ 2015-09-01 15:03 ` Michael S. Tsirkin
2015-09-01 23:48 ` H. Peter Anvin
0 siblings, 1 reply; 15+ messages in thread
From: Michael S. Tsirkin @ 2015-09-01 15:03 UTC (permalink / raw)
To: Ingo Molnar
Cc: H. Peter Anvin, linux-kernel, Thomas Gleixner, Ingo Molnar, x86,
Rusty Russell, Linus Torvalds, Uros Bizjak
On Tue, Sep 01, 2015 at 01:39:42PM +0200, Ingo Molnar wrote:
>
> * Michael S. Tsirkin <mst@redhat.com> wrote:
>
> > On Tue, Sep 01, 2015 at 11:24:22AM +0200, Ingo Molnar wrote:
> > >
> > > * Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > > I applied this patch on top of mine:
> > >
> > > Yeah, looks similar to the one I sent.
> > >
> > > > -static inline int __variable_test_bit(long nr, const unsigned long *addr)
> > > > -{
> > > > - int oldbit;
> > > > -
> > > > - asm volatile("bt %2,%1\n\t"
> > > > - "sbb %0,%0"
> > > > - : "=r" (oldbit)
> > > > - : "m" (*addr), "Ir" (nr));
> > > > -
> > > > - return oldbit;
> > > > -}
> > >
> > > > And the code size went up:
> > > >
> > > > 134836 2997 8372 146205 23b1d arch/x86/kvm/kvm-intel.ko ->
> > > > 134846 2997 8372 146215 23b27 arch/x86/kvm/kvm-intel.ko
> > > >
> > > > 342690 47640 441 390771 5f673 arch/x86/kvm/kvm.ko ->
> > > > 342738 47640 441 390819 5f6a3 arch/x86/kvm/kvm.ko
> > > >
> > > > I tried removing __always_inline, this had no effect.
> > >
> > > But code size isn't the only factor.
> > >
> > > Uros Bizjak pointed out that the reason GCC does not use the "BT reg,mem"
> > > instruction is that it's highly suboptimal even on recent microarchitectures,
> > > Sandy Bridge is listed as having a 10 cycles latency (!) for this instruction:
> > >
> > > http://www.agner.org/optimize/instruction_tables.pdf
> > >
> > > this instruction had bad latency going back to Pentium 4 CPUs.
> > >
> > > ... so unless something changed in this area with Skylake I think using the
> > > __variable_test_bit() code of the kernel is a bad choice and looking at kernel
> > > size only is misleading.
> > >
> > > It makes sense for atomics, but not for unlocked access.
> > >
> > > Thanks,
> > >
> > > Ingo
> >
> > Hmm - so do you take back the ack?
>
> I have no strong feelings either way, it simply strikes me as misguided to
> explicitly optimize for something that is listed as a high overhead instruction.
>
> Assuming it really is high overhead:
>
> > I compared this:
> > int main(int argc, char **argv)
> > {
> >
> > long long int i;
> > const unsigned long addr = 0;
> > for (i = 0; i < 1000000000ull; ++i) {
> > asm volatile("");
> > if (__variable_test_bit(1, &addr))
> > asm volatile("");
> > }
> > return 0;
> > }
> >
> > with the __constant_test_bit variant.
> >
> > __constant_test_bit code does appear to be slower on an i7 processor.
Ouch. I meant the reverse:
[mst@robin test]$ diff a.c b.c
31c31
< if (__variable_test_bit(1, &addr))
---
> if (__constant_test_bit(1, &addr))
[mst@robin test]$ gcc -Wall -O2 a.c; time ./a.out
real 0m0.532s
user 0m0.531s
sys 0m0.000s
[mst@robin test]$ gcc -Wall -O2 b.c; time ./a.out
real 0m0.517s
user 0m0.517s
sys 0m0.000s
So __constant_test_bit is faster even though it's using more
instructions
$ gcc -Wall -O2 a.c; objdump -ld ./a.out
08048414 <main>:
main():
8048414: 8d 4c 24 04 lea 0x4(%esp),%ecx
8048418: 83 e4 f8 and $0xfffffff8,%esp
804841b: ff 71 fc pushl -0x4(%ecx)
804841e: 55 push %ebp
804841f: 89 e5 mov %esp,%ebp
8048421: 51 push %ecx
8048422: 83 ec 14 sub $0x14,%esp
8048425: c7 45 ec 00 00 00 00 movl $0x0,-0x14(%ebp)
804842c: c7 45 f0 00 00 00 00 movl $0x0,-0x10(%ebp)
8048433: c7 45 f4 00 00 00 00 movl $0x0,-0xc(%ebp)
804843a: eb 1a jmp 8048456 <main+0x42>
804843c: 8d 45 ec lea -0x14(%ebp),%eax
804843f: 50 push %eax
8048440: 6a 01 push $0x1
8048442: e8 b4 ff ff ff call 80483fb <__variable_test_bit>
8048447: 83 c4 08 add $0x8,%esp
804844a: 85 c0 test %eax,%eax
804844c: 74 00 je 804844e <main+0x3a>
804844e: 83 45 f0 01 addl $0x1,-0x10(%ebp)
8048452: 83 55 f4 00 adcl $0x0,-0xc(%ebp)
8048456: 8b 45 f0 mov -0x10(%ebp),%eax
8048459: 8b 55 f4 mov -0xc(%ebp),%edx
804845c: 83 fa 00 cmp $0x0,%edx
804845f: 72 db jb 804843c <main+0x28>
8048461: 83 fa 00 cmp $0x0,%edx
8048464: 77 07 ja 804846d <main+0x59>
8048466: 3d ff c9 9a 3b cmp $0x3b9ac9ff,%eax
804846b: 76 cf jbe 804843c <main+0x28>
804846d: b8 00 00 00 00 mov $0x0,%eax
8048472: 8b 4d fc mov -0x4(%ebp),%ecx
8048475: c9 leave
8048476: 8d 61 fc lea -0x4(%ecx),%esp
8048479: c3 ret
804847a: 66 90 xchg %ax,%ax
804847c: 66 90 xchg %ax,%ax
804847e: 66 90 xchg %ax,%ax
$ gcc -Wall -O2 b.c; objdump -ld ./a.out
080483fb <main>:
main():
80483fb: 8d 4c 24 04 lea 0x4(%esp),%ecx
80483ff: 83 e4 f8 and $0xfffffff8,%esp
8048402: ff 71 fc pushl -0x4(%ecx)
8048405: 55 push %ebp
8048406: 89 e5 mov %esp,%ebp
8048408: 51 push %ecx
8048409: 83 ec 24 sub $0x24,%esp
804840c: c7 45 e4 00 00 00 00 movl $0x0,-0x1c(%ebp)
8048413: c7 45 f0 00 00 00 00 movl $0x0,-0x10(%ebp)
804841a: c7 45 f4 00 00 00 00 movl $0x0,-0xc(%ebp)
8048421: eb 44 jmp 8048467 <main+0x6c>
8048423: c7 45 ec 01 00 00 00 movl $0x1,-0x14(%ebp)
804842a: 8d 45 e4 lea -0x1c(%ebp),%eax
804842d: 89 45 e8 mov %eax,-0x18(%ebp)
8048430: 8b 45 ec mov -0x14(%ebp),%eax
8048433: c1 f8 05 sar $0x5,%eax
8048436: 8d 14 85 00 00 00 00 lea 0x0(,%eax,4),%edx
804843d: 8b 45 e8 mov -0x18(%ebp),%eax
8048440: 01 d0 add %edx,%eax
8048442: 8b 10 mov (%eax),%edx
8048444: 8b 45 ec mov -0x14(%ebp),%eax
8048447: 83 e0 1f and $0x1f,%eax
804844a: 89 c1 mov %eax,%ecx
804844c: d3 ea shr %cl,%edx
804844e: 89 d0 mov %edx,%eax
8048450: 83 e0 01 and $0x1,%eax
8048453: 85 c0 test %eax,%eax
8048455: 0f 95 c0 setne %al
8048458: 0f b6 c0 movzbl %al,%eax
804845b: 85 c0 test %eax,%eax
804845d: 74 00 je 804845f <main+0x64>
804845f: 83 45 f0 01 addl $0x1,-0x10(%ebp)
8048463: 83 55 f4 00 adcl $0x0,-0xc(%ebp)
8048467: 8b 45 f0 mov -0x10(%ebp),%eax
804846a: 8b 55 f4 mov -0xc(%ebp),%edx
804846d: 83 fa 00 cmp $0x0,%edx
8048470: 72 b1 jb 8048423 <main+0x28>
8048472: 83 fa 00 cmp $0x0,%edx
8048475: 77 07 ja 804847e <main+0x83>
8048477: 3d ff c9 9a 3b cmp $0x3b9ac9ff,%eax
804847c: 76 a5 jbe 8048423 <main+0x28>
804847e: b8 00 00 00 00 mov $0x0,%eax
8048483: 83 c4 24 add $0x24,%esp
8048486: 59 pop %ecx
8048487: 5d pop %ebp
8048488: 8d 61 fc lea -0x4(%ecx),%esp
804848b: c3 ret
804848c: 66 90 xchg %ax,%ax
804848e: 66 90 xchg %ax,%ax
--
MST
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH 1/2] x86/bitops: implement __test_bit
2015-09-01 15:03 ` Michael S. Tsirkin
@ 2015-09-01 23:48 ` H. Peter Anvin
0 siblings, 0 replies; 15+ messages in thread
From: H. Peter Anvin @ 2015-09-01 23:48 UTC (permalink / raw)
To: Michael S. Tsirkin, Ingo Molnar
Cc: linux-kernel, Thomas Gleixner, Ingo Molnar, x86, Rusty Russell,
Linus Torvalds, Uros Bizjak
On 09/01/2015 08:03 AM, Michael S. Tsirkin wrote:
>>>
>>> Hmm - so do you take back the ack?
>>
>> I have no strong feelings either way, it simply strikes me as misguided to
>> explicitly optimize for something that is listed as a high overhead instruction.
>>
>
> [mst@robin test]$ diff a.c b.c
> 31c31
> < if (__variable_test_bit(1, &addr))
> ---
> > if (__constant_test_bit(1, &addr))
>
> [mst@robin test]$ gcc -Wall -O2 a.c; time ./a.out
>
> real 0m0.532s
> user 0m0.531s
> sys 0m0.000s
> [mst@robin test]$ gcc -Wall -O2 b.c; time ./a.out
>
> real 0m0.517s
> user 0m0.517s
> sys 0m0.000s
>
>
> So __constant_test_bit is faster even though it's using more
> instructions
> $ gcc -Wall -O2 a.c; objdump -ld ./a.out
>
I think this is well understood. The use of bts/btc in locked
operations is sometimes justified since it reports the bit status back
out, whereas in unlocked operations bts/btc has no benefit except for
code size. bt is a read operation, and is therefore "never/always"
atomic; it cannot be locked because there is no read/write pair to lock.
So it is strictly an issue of code size versus performance.
However, your test is simply faulty:
804843f: 50 push %eax
8048440: 6a 01 push $0x1
8048442: e8 b4 ff ff ff call 80483fb <__variable_test_bit>
You're encapsulating the __variable_test_bit() version into an expensive
function call, whereas the __constant_test_bit() seems to emit code that
is quite frankly completely bonkers insane:
8048444: 8b 45 ec mov -0x14(%ebp),%eax
8048447: 83 e0 1f and $0x1f,%eax
804844a: 89 c1 mov %eax,%ecx
804844c: d3 ea shr %cl,%edx
804844e: 89 d0 mov %edx,%eax
8048450: 83 e0 01 and $0x1,%eax
8048453: 85 c0 test %eax,%eax
8048455: 0f 95 c0 setne %al
8048458: 0f b6 c0 movzbl %al,%eax
804845b: 85 c0 test %eax,%eax
804845d: 74 00 je 804845f <main+0x64>
Observe the sequence and/test/setne/movzbl/test!
-hpa
^ permalink raw reply [flat|nested] 15+ messages in thread
end of thread, other threads:[~2015-09-01 23:49 UTC | newest]
Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-08-30 8:38 [PATCH 1/2] x86/bitops: implement __test_bit Michael S. Tsirkin
2015-08-30 8:38 ` [PATCH 2/2] kvm/x86: use __test_bit Michael S. Tsirkin
2015-08-31 6:05 ` [PATCH 1/2] x86/bitops: implement __test_bit Ingo Molnar
2015-08-31 6:13 ` H. Peter Anvin
2015-08-31 7:56 ` Michael S. Tsirkin
2015-08-31 7:59 ` Ingo Molnar
2015-08-31 8:15 ` yalin wang
2015-08-31 8:19 ` Ingo Molnar
2015-08-31 8:15 ` Ingo Molnar
2015-08-31 11:19 ` Michael S. Tsirkin
2015-09-01 9:24 ` Ingo Molnar
2015-09-01 9:40 ` Michael S. Tsirkin
2015-09-01 11:39 ` Ingo Molnar
2015-09-01 15:03 ` Michael S. Tsirkin
2015-09-01 23:48 ` H. Peter Anvin
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox