linux-alpha.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Richard Henderson <rth7680@gmail.com>
To: Aurelien Jarno <aurelien@aurel32.net>
Cc: Matt Turner <mattst88@gmail.com>,
	Carlos O'Donell <carlos@systemhalted.org>,
	debian-alpha@lists.debian.org, debian-glibc@lists.debian.org,
	Ivan Kokshaysky <ink@jurassic.park.msu.ru>,
	linux-alpha@vger.kernel.org
Subject: Re: Help on memchr() EGLIBC assembly code
Date: Wed, 29 Jul 2009 17:24:59 -0700	[thread overview]
Message-ID: <4A70E85B.7090806@gmail.com> (raw)
In-Reply-To: <20090726234506.GB13723@volta.aurel32.net>

[-- Attachment #1: Type: text/plain, Size: 1388 bytes --]

On 07/26/2009 04:45 PM, Aurelien Jarno wrote:
> Knowing that $31 could be used for prefetch, I have modified the
> assembly code from memchr.S to use it. It passes all the testsuite.
>

This isn't intended to be a prefetch instruction, it's
meant to be fetching the data for the next word.  I.e.
we've unrolled the loop and there's at least 8 bytes
left in the search.

Note the

         # At least two quads remain to be accessed.

comment.  At that point we're supposed to be more
than 16 bytes away from the end of the input buffer.

Actually, the confusion I see is farther upthread:

> > >>>>> The problem is that the memchr() function on alpha uses 
> prefetch, which
> > >>>>> can cause a page boundary to be crossed, while the standards 
> (POSIX and
> > >>>>> C99) says it should stop when a match is found.

I didn't realize this when I wrote the function.

The entire function should be rewritten, since there's little
point in using a prefetch instruction that close to the load.
Prefetch instructions should be used to move data into the L1
cache, not hide the 3 cycle load delay.  Thus a prefetch, if
used, should be several cache lines ahead, not just a single word.

Perhaps a better solution would be to read words until we get
cacheline aligned, then read the entire line into 8 registers,
prefetch the next line, then process each register one by one.

Try this.


r~

[-- Attachment #2: memchr.c --]
[-- Type: text/x-csrc, Size: 3718 bytes --]

typedef unsigned long word;

#define ldq(X)		(*(const word *)(X))
#define ldq_u(X)	(*(const word *)((X) & -8))

#define cmpbge		__builtin_alpha_cmpbge
#define extql		__builtin_alpha_extql
#define extqh		__builtin_alpha_extqh
#define insbl		__builtin_alpha_insbl
#define insqh		__builtin_alpha_insqh
#define zap		__builtin_alpha_zap

#define unlikely(X)	__builtin_expect ((X), 0)
#define prefetch(X)	__builtin_prefetch ((void *)(X), 0)

#define find(X, Y)	cmpbge (0, (X) ^ (Y))

void *memchr (const void *xs, int xc, word n)
{
  word s = (word)xs;
  word c;
  word current, found;
  word s_align;

  if (unlikely (n == 0))
    return 0;

  current = ldq_u (s);

  /* Replicate low byte of C into all bytes.  */
  {
    word t = insbl (xc, 1);		/* 000000c0 */
    c = (xc & 0xff) | t;		/* 000000cc */
    c = (c << 16) | c;			/* 0000cccc */
    c = (c << 32) | c;			/* cccccccc */
  }

  if (unlikely (n < 9))
    goto only_quad;

  /* Align the source, and decrement the count by the number
     of bytes searched in the first word.  */
  s_align = s & -8;
  n += (s & 7);
  n -= 8;

  /* Deal with misalignment in the first word for the comparison.  */
  {
    word mask = insqh (-1, s);
    found = cmpbge (0, (current ^ c) | mask);
    if (found)
      goto found_it;
  }

  s_align += 8;

  /* If the block is sufficiently large, align to cacheline (minimum 64-bytes),
     prefetch the next line, and read entire cachelines at a time.  */
  if (unlikely (n >= 256))
    {
      prefetch (s_align + 64);
      while (s_align & 63)
	{
	  current = ldq (s_align);
	  found = find (current, c);
	  if (found)
	    goto found_it;
	  s_align += 8;
	  n -= 8;
	}

      while (n > 64)
	{
	  word c0, c1, c2, c3, c4, c5, c6, c7;

	  prefetch (s_align + 64);
	  c0 = ldq (s_align + 0*8);
	  c1 = ldq (s_align + 1*8);
	  c2 = ldq (s_align + 2*8);
	  c3 = ldq (s_align + 3*8);
	  c4 = ldq (s_align + 4*8);
	  c5 = ldq (s_align + 5*8);
	  c6 = ldq (s_align + 6*8);
	  c7 = ldq (s_align + 7*8);

	  found = find (c0, c);
	  if (unlikely (found))
	    goto found_it;
	  s_align += 8;

	  found = find (c1, c);
	  if (unlikely (found))
	    goto found_it;
	  s_align += 8;

	  found = find (c2, c);
	  if (unlikely (found))
	    goto found_it;
	  s_align += 8;

	  found = find (c3, c);
	  if (unlikely (found))
	    goto found_it;
	  s_align += 8;

	  found = find (c4, c);
	  if (unlikely (found))
	    goto found_it;
	  s_align += 8;

	  found = find (c5, c);
	  if (unlikely (found))
	    goto found_it;
	  s_align += 8;

	  found = find (c6, c);
	  if (unlikely (found))
	    goto found_it;
	  s_align += 8;

	  found = find (c7, c);
	  if (unlikely (found))
	    goto found_it;
	  s_align += 8;
	  n -= 64;
	}
    }

  /* Quadword aligned loop.  */
  while (1)
    {
      current = ldq (s_align);
      if (n < 8)
	goto last_quad;
      found = find (current, c);
      if (found)
	goto found_it;

      s_align += 8;
      n -= 8;
    }

 only_quad:
  {
    word end = zap (n, 0x80) - 1;
    word last = extqh (ldq_u (s + end), s);
    word first = extql (current, s);
    current = first | last;

    /* We've read one word and aligned it in the register.  Thus the 
       bit offset in current is relative to S.  */
    s_align = s;
  }

 last_quad:
  {
    word mask = (-1ul >> -n); 
    found = find (current, c) & mask;
    if (found == 0)
      return 0;
  }

 found_it:
  {
    word offset;

#ifdef __alpha_cix__
    offset = __builtin_alpha_cttz (found);
#else
    /* Extract LSB.  */
    found &= -found;

    /* Binary search for the LSB.  */
    offset  = (found & 0x0f ? 0 : 4);
    offset += (found & 0x33 ? 0 : 2);
    offset += (found & 0x55 ? 0 : 1);
#endif

    return (void *)(s_align + offset);
  }
}

  parent reply	other threads:[~2009-07-30  0:24 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <20090713173104.GA13883@hall.aurel32.net>
     [not found] ` <119aab440907131124r3fd333d3n967cdde2cf3c2e1b@mail.gmail.com>
     [not found]   ` <20090713211723.GE10110@hall.aurel32.net>
2009-07-13 22:16     ` Help on memchr() EGLIBC assembly code Matt Turner
2009-07-13 22:24       ` Aurelien Jarno
2009-07-15 19:48       ` Richard Henderson
2009-07-19 14:29         ` Aurelien Jarno
2009-07-26 23:45           ` Aurelien Jarno
2009-07-27  9:29             ` Aurelien Jarno
2009-07-30  0:24             ` Richard Henderson [this message]
2009-07-30 16:29               ` Aurelien Jarno
2009-07-31 23:25                 ` Richard Henderson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4A70E85B.7090806@gmail.com \
    --to=rth7680@gmail.com \
    --cc=aurelien@aurel32.net \
    --cc=carlos@systemhalted.org \
    --cc=debian-alpha@lists.debian.org \
    --cc=debian-glibc@lists.debian.org \
    --cc=ink@jurassic.park.msu.ru \
    --cc=linux-alpha@vger.kernel.org \
    --cc=mattst88@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).