From mboxrd@z Thu Jan 1 00:00:00 1970 From: Andrew Cooper Subject: ffs vs __builtin_ffs Date: Thu, 31 Oct 2013 10:53:46 +0000 Message-ID: <527236BA.3010102@citrix.com> Mime-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Return-path: List-Unsubscribe: , List-Post: List-Help: List-Subscribe: , Sender: xen-devel-bounces@lists.xen.org Errors-To: xen-devel-bounces@lists.xen.org To: Xen-devel List , Keir Fraser , Jan Beulich , Tim Deegan List-Id: xen-devel@lists.xenproject.org Hello, When debugging an issue, I noticed that the use of ffs() from bitopts.h is leading to particularly poor code. (See below for the example, I suspect there is something causing particularly bad behaviour for an optimisation step) The bitop functions are deliberately designed to be compatible with their libc and compiler builtin variants, and replacing it leads to 0x60 bytes disappearing from the .text section even with the small handful of uses in the x86 tree. This is because the compiler can optimise far more around its builtin than our rigid inline assembly. Would it be acceptable to provide a patch which does a straight replace of the static inline function with a define? __builtin_ffs is supported in all compilers we support, but there does not appear to any sane way to detect the presence of the builtin. ~Andrew Example (from my upcoming changes to the HPET code): Code: static uint32_t free_channels = (1U << num_hpets_used) - 1; # pseudocode, for brevity static struct hpet_event_channel * noinline hpet_get_free_channel(void) { unsigned ch, tries; for ( tries = num_hpets_used; tries; --tries ) { if ( (ch = ffs(free_channels)) == 0 ) break; --ch; ASSERT(ch < num_hpets_used); if ( test_and_clear_bit(ch, &free_channels) ) return &hpet_events[ch]; } return NULL; } With regular ffs: ffff82d08019e150 : ffff82d08019e150: 55 push %rbp ffff82d08019e151: 48 89 e5 mov %rsp,%rbp ffff82d08019e154: 8b 15 9e 53 0d 00 mov 0xd539e(%rip),%edx # ffff82d0802734f8 ffff82d08019e15a: 85 d2 test %edx,%edx ffff82d08019e15c: 74 7d je ffff82d08019e1db ffff82d08019e15e: 8b 05 6c 82 17 00 mov 0x17826c(%rip),%eax # ffff82d0803163d0 ffff82d08019e164: 48 0f bc c0 bsf %rax,%rax ffff82d08019e168: 75 07 jne ffff82d08019e171 ffff82d08019e16a: 48 c7 c0 ff ff ff ff mov $0xffffffffffffffff,%rax ffff82d08019e171: 83 c0 01 add $0x1,%eax ffff82d08019e174: 74 6c je ffff82d08019e1e2 ffff82d08019e176: 83 e8 01 sub $0x1,%eax ffff82d08019e179: 39 c2 cmp %eax,%edx ffff82d08019e17b: 76 33 jbe ffff82d08019e1b0 ffff82d08019e17d: f0 0f b3 05 4b 82 17 lock btr %eax,0x17824b(%rip) # ffff82d0803163d0 ffff82d08019e184: 00 ffff82d08019e185: 19 c9 sbb %ecx,%ecx ffff82d08019e187: 85 c9 test %ecx,%ecx ffff82d08019e189: 74 44 je ffff82d08019e1cf ffff82d08019e18b: eb 33 jmp ffff82d08019e1c0 ffff82d08019e18d: 8b 05 3d 82 17 00 mov 0x17823d(%rip),%eax # ffff82d0803163d0 ffff82d08019e193: 48 0f bc c0 bsf %rax,%rax ffff82d08019e197: 75 07 jne ffff82d08019e1a0 ffff82d08019e199: 48 c7 c0 ff ff ff ff mov $0xffffffffffffffff,%rax ffff82d08019e1a0: 83 c0 01 add $0x1,%eax ffff82d08019e1a3: 74 44 je ffff82d08019e1e9 ffff82d08019e1a5: 83 e8 01 sub $0x1,%eax ffff82d08019e1a8: 3b 05 4a 53 0d 00 cmp 0xd534a(%rip),%eax # ffff82d0802734f8 ffff82d08019e1ae: 72 02 jb ffff82d08019e1b2 ffff82d08019e1b0: 0f 0b ud2 ffff82d08019e1b2: f0 0f b3 05 16 82 17 lock btr %eax,0x178216(%rip) # ffff82d0803163d0 ffff82d08019e1b9: 00 ffff82d08019e1ba: 19 c9 sbb %ecx,%ecx ffff82d08019e1bc: 85 c9 test %ecx,%ecx ffff82d08019e1be: 74 0f je ffff82d08019e1cf ffff82d08019e1c0: 89 c0 mov %eax,%eax ffff82d08019e1c2: 48 c1 e0 07 shl $0x7,%rax ffff82d08019e1c6: 48 03 05 33 53 0d 00 add 0xd5333(%rip),%rax # ffff82d080273500 ffff82d08019e1cd: eb 1f jmp ffff82d08019e1ee ffff82d08019e1cf: 83 ea 01 sub $0x1,%edx ffff82d08019e1d2: 75 b9 jne ffff82d08019e18d ffff82d08019e1d4: b8 00 00 00 00 mov $0x0,%eax ffff82d08019e1d9: eb 13 jmp ffff82d08019e1ee ffff82d08019e1db: b8 00 00 00 00 mov $0x0,%eax ffff82d08019e1e0: eb 0c jmp ffff82d08019e1ee ffff82d08019e1e2: b8 00 00 00 00 mov $0x0,%eax ffff82d08019e1e7: eb 05 jmp ffff82d08019e1ee ffff82d08019e1e9: b8 00 00 00 00 mov $0x0,%eax ffff82d08019e1ee: 5d pop %rbp ffff82d08019e1ef: c3 retq With __builtin_ffs: ffff82d08019e4a5 : ffff82d08019e4a5: 55 push %rbp ffff82d08019e4a6: 48 89 e5 mov %rsp,%rbp ffff82d08019e4a9: 8b 15 c9 4f 0d 00 mov 0xd4fc9(%rip),%edx # ffff82d080273478 ffff82d08019e4af: 85 d2 test %edx,%edx ffff82d08019e4b1: 74 4a je ffff82d08019e4fd ffff82d08019e4b3: be ff ff ff ff mov $0xffffffff,%esi ffff82d08019e4b8: 0f bc 05 11 7f 17 00 bsf 0x177f11(%rip),%eax # ffff82d0803163d0 ffff82d08019e4bf: 0f 44 c6 cmove %esi,%eax ffff82d08019e4c2: 83 c0 01 add $0x1,%eax ffff82d08019e4c5: 74 3d je ffff82d08019e504 ffff82d08019e4c7: 83 e8 01 sub $0x1,%eax ffff82d08019e4ca: 3b 05 a8 4f 0d 00 cmp 0xd4fa8(%rip),%eax # ffff82d080273478 ffff82d08019e4d0: 72 02 jb ffff82d08019e4d4 ffff82d08019e4d2: 0f 0b ud2 ffff82d08019e4d4: f0 0f b3 05 f4 7e 17 lock btr %eax,0x177ef4(%rip) # ffff82d0803163d0 ffff82d08019e4db: 00 ffff82d08019e4dc: 19 c9 sbb %ecx,%ecx ffff82d08019e4de: 85 c9 test %ecx,%ecx ffff82d08019e4e0: 74 0f je ffff82d08019e4f1 ffff82d08019e4e2: 89 c0 mov %eax,%eax ffff82d08019e4e4: 48 c1 e0 07 shl $0x7,%rax ffff82d08019e4e8: 48 03 05 91 4f 0d 00 add 0xd4f91(%rip),%rax # ffff82d080273480 ffff82d08019e4ef: eb 18 jmp ffff82d08019e509 ffff82d08019e4f1: 83 ea 01 sub $0x1,%edx ffff82d08019e4f4: 75 c2 jne ffff82d08019e4b8 ffff82d08019e4f6: b8 00 00 00 00 mov $0x0,%eax ffff82d08019e4fb: eb 0c jmp ffff82d08019e509 ffff82d08019e4fd: b8 00 00 00 00 mov $0x0,%eax ffff82d08019e502: eb 05 jmp ffff82d08019e509 ffff82d08019e504: b8 00 00 00 00 mov $0x0,%eax ffff82d08019e509: 5d pop %rbp ffff82d08019e50a: c3 retq