public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: "Gary Funck" <gary@intrepid.com>
To: "Linux-Kernel@Vger. Kernel. Org" <linux-kernel@vger.kernel.org>
Subject: RE: 2.6.17-1.2145_FC5 mmap-related soft lockup
Date: Sat, 15 Jul 2006 23:50:26 -0700	[thread overview]
Message-ID: <JCEPIPKHCJGDMPOHDOIGKEKMDFAA.gary@intrepid.com> (raw)
In-Reply-To: <JCEPIPKHCJGDMPOHDOIGGEKJDFAA.gary@intrepid.com>

[-- Attachment #1: Type: text/plain, Size: 1724 bytes --]



> From: Gary Funck
> Sent: Saturday, July 15, 2006 10:07 AM
>
> A test program which allocates about 256M of MAP_ANONYMOUS mmap memory,
> and then spawns 4 processes, where each process i writes to 1/4 of the
> mapped memory, and then reads the memory written by
> the process (i + 1)%4, triggers a soft lockup, when exiting.
> Hardware:
> dual core dual Opteron 275 (Tyan motherboard, 4G physical memory)
> has been rock solid reliable.
>
> BUG: soft lockup detected on CPU#3!

Follow-up: the attached test program, when compiled on FC5 with the
2.6.17-1.2145 kernel, will cause lost timer ticks, lost RS-232 interrupts,
and will often lead to the soft lockup situation shown below:

BUG: soft lockup detected on CPU#0!

Call Trace: <IRQ> <ffffffff802b2fb5>{softlockup_tick+219}
       <ffffffff8029708e>{update_process_times+66}
<ffffffff8027a3ed>{smp_local_timer_interrupt+35}
       <ffffffff8027aa95>{smp_apic_timer_interrupt+65}
<ffffffff80263acb>{apic_timer_interrupt+135} <EOI>
       <ffffffff8020e578>{__set_page_dirty_nobuffers+0}
<ffffffff8020a7ab>{release_pages+111}
       <ffffffff80267b76>{thread_return+0}
<ffffffff80267bd4>{thread_return+94}
       <ffffffff8020e578>{__set_page_dirty_nobuffers+0}
<ffffffff8020e578>{__set_page_dirty_nobuffers+0}
       <ffffffff8020e09e>{free_pages_and_swap_cache+115}
<ffffffff80207b62>{unmap_vmas+1145}
       <ffffffff8023c7d9>{exit_mmap+120} <ffffffff8023eda8>{mmput+44}
       <ffffffff80215ece>{do_exit+599}
<ffffffff8024cacd>{debug_mutex_init+0}
       <ffffffff80262f01>{tracesys+209}

After compiling the code, a continuous loop like the following seems to
eventually lead to the soft lockup situation (FC5, x86_64) shown above:

while true; do
  a.out
done


[-- Attachment #2: mmap_soft_lockup_x86_64.c --]
[-- Type: application/octet-stream, Size: 3862 bytes --]

#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/wait.h>

/* Size in bytes (256 MiB) of the slice of the large shared mapping that
   belongs to each child process.  */
#define PER_PROCESS_ALLOC (256*1024*1024)

/* Number of int words in the barrier bitmap: 256 bits, 32 per word.  */
#define NUM_SYNC_WORDS (256/32)

/* Control block kept in shared memory.  `sync' is the barrier bitmap,
   indexed by process number (see barrier ()).  */
typedef struct shared_data_s
  {
    int sync[NUM_SYNC_WORDS];
  } shared_data_t, *shared_data_p;

/* Index of this process; main () advances it in the fork loop, so each
   child inherits its own value.  */
int n;

/* Number of child processes taking part in the test.  */
int num_cpus;

/* Shared control block and base of the large shared data mapping; both
   are set up by main () before forking.  */
shared_data_p shared_data;
void *map;

#if __x86_64__

#define LOCK_PREFIX "lock ; "

/* Atomic compare-and-swap on an int.

   Compares *PTR with OLD and, if they are equal, stores NEW into *PTR,
   using a single LOCK-prefixed CMPXCHG instruction.  Returns non-zero
   when the store took place, and zero when *PTR held a different value
   (in which case *PTR is left unchanged).  */
int
atomic_cas (int *ptr, int old, int new)
{
  int prev;
  /* CMPXCHG compares %eax (preloaded with OLD through the "0" matching
     constraint) with the memory operand.  On a match NEW is written to
     memory; otherwise the current memory value is loaded into %eax.
     Either way PREV receives the value *PTR held before the instruction.

     The memory operand is declared read-write ("+m") because the
     instruction may store to it; an input-only "m" operand must not be
     modified by the asm.  The "memory" clobber additionally stops the
     compiler from caching other shared data across the operation, and
     "cc" records that the flags are changed.  */
  __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %2,%1"
                       : "=a"(prev), "+m"(*ptr)
                       : "q"(new), "0"(old)
                       : "memory", "cc");
  return prev == old;
}
#else
# error this test must be run on an x86_64 cpu
#endif

/* Return 1 if bit number BITNUM of the bitmap starting at BITS is set,
   and 0 otherwise.  Bit K is stored in word K/32 at bit position K%32.
   BITNUM is expected to be non-negative.  */
int
get_bit (int *bits, int bitnum)
{
  /* Read through a volatile pointer: callers poll bits that other
     processes modify, so the load has to be performed on every call
     instead of being hoisted out of the polling loop by the optimizer.  */
  const volatile int *word = bits + (bitnum / 32);
  /* Build the mask as unsigned; shifting 1 into the sign bit of a signed
     int (bit position 31) is undefined behavior.  */
  unsigned int mask = 1u << (bitnum % 32);
  return ((unsigned int) *word & mask) != 0;
}

/* Atomically set bit number BITNUM in the bitmap starting at BITS.
   Bit K is stored in word K/32 at bit position K%32.  The word is
   updated with a compare-and-swap retry loop, so concurrent updates to
   other bits of the same word are not lost.  BITNUM is expected to be
   non-negative.  */
void
atomic_set_bit (int *bits, int bitnum)
{
  int *word = bits + (bitnum / 32);
  /* Compute the mask in unsigned arithmetic: `1 << 31' on a signed int
     is undefined behavior.  */
  int bit = (int) (1u << (bitnum % 32));
  int old_val, new_val;
  do
    {
      /* Volatile read so that every retry observes the current value of
         the shared word.  */
      old_val = *(volatile int *) word;
      new_val = old_val | bit;
    }
  while (!atomic_cas (word, old_val, new_val));
}

/* Spin barrier across the NUM_CPUS test processes.

   Every process sets its own bit (index N) in the shared bitmap.
   Process 0 then spins until the first NUM_CPUS bits are all set and
   clears the bitmap, which releases the others; every other process
   spins until its own bit has been cleared.  */
void
barrier (void)
{
  int *sync = shared_data->sync;
  atomic_set_bit (sync, n);  /* set my bit */
  if (n == 0)
    {
      int i, cnt;
      /* spin until all bits are set */
      for (cnt = 0; cnt != num_cpus;)
        {
          /* Compiler barrier: the bitmap is changed by other processes,
             so it must be re-read from memory on every pass.  */
          __asm__ __volatile__ ("" : : : "memory");
          for (i = 0, cnt = 0; i < num_cpus; ++i)
            if (get_bit (sync, i))
              ++cnt;
        }
      /* all set, open the barrier. */
      for (i = 0; i < (num_cpus + 31)/32; ++i)
        ((volatile int *) sync)[i] = 0;
    }
  else
    {
      /* spin on my bit set */
      while (get_bit (sync, n))
        __asm__ __volatile__ ("" : : : "memory");
    }
}

/* Body of each child process; does not return.

   After an initial barrier, fills the slice of the shared mapping that
   belongs to the next process ((N + 1) % NUM_CPUS) with that process's
   letter, waits at a barrier, then checks that this process's own slice
   holds the expected letter by comparing it against a freshly allocated
   reference buffer.  Aborts on allocation failure or data mismatch;
   otherwise exits with status 0 after a final barrier.  */
void
run_test ()
{
  int nxt = (n + 1) % num_cpus;
  /* Do the pointer arithmetic on char * (arithmetic on void * is a GNU
     extension) and compute offsets in size_t: the int product
     index * PER_PROCESS_ALLOC overflows once the index reaches 8.  */
  char *base = map;
  char *s;
  char *buf;
  int c;
  barrier ();
  /* write the data for the next process */
  s = base + (size_t) nxt * PER_PROCESS_ALLOC;
  c = 'A' + nxt % 26;
  memset (s, c, PER_PROCESS_ALLOC-1);
  s[PER_PROCESS_ALLOC-1] = '\0';
  barrier ();
  /* read our data */
  s = base + (size_t) n * PER_PROCESS_ALLOC;
  c = 'A' + n % 26;
  buf = malloc (PER_PROCESS_ALLOC);
  if (!buf)
    { perror ("malloc"); abort (); }
  memset (buf, c, PER_PROCESS_ALLOC-1);
  buf[PER_PROCESS_ALLOC-1] = '\0';
  if (strcmp(s, buf))
    { fprintf (stderr, "%d: data mismatch\n", n); abort (); }
  barrier ();
  /* buf is intentionally not freed: the process exits right away.  */
  exit (0);
}

int
main (int argc, char *argv[])
{
  char *pgm = argv[0];
  int mask;
  int fd;
  int pid;
  off_t alloc_size;
  int wait_status;
  num_cpus = (int)sysconf(_SC_NPROCESSORS_ONLN);
  if (num_cpus <= 0)
    { perror ("sysconf"); abort (); }
  for (mask = 1; (num_cpus & ~mask); mask <<= 1)
    num_cpus = (num_cpus & ~mask);
  shared_data = mmap((void *)0, 64*1024*1024, PROT_READ|PROT_WRITE,
	             MAP_SHARED | MAP_ANONYMOUS, 0, (off_t)0);
  if (shared_data == MAP_FAILED)
    { perror ("mmap"); abort (); }
  memset (shared_data, '\0', sizeof (shared_data_t));
  alloc_size = num_cpus * PER_PROCESS_ALLOC;
  map = mmap((void *)0, alloc_size, PROT_READ|PROT_WRITE,
	     MAP_SHARED | MAP_ANONYMOUS, 0, (off_t)0);
  if (map == MAP_FAILED)
    { perror ("mmap"); abort (); }
  for (n = 0; n < num_cpus; ++n)
    {
      pid = fork ();
      if (pid == 0)
        run_test (n);  /* no return */
      else if (pid < 0)
        { perror ("fork"); abort (); }
    }
  while ((pid = wait (&wait_status)) > 0)
    {
      if (WIFEXITED (wait_status))
	{
	  int child_exit = WEXITSTATUS (wait_status);
	  if (child_exit)
	    { fprintf (stderr, "non-zero child exit status\n"); abort ();}
	}
      else if (WIFSIGNALED (wait_status))
	{
	  int child_sig = WTERMSIG (wait_status);
	  fprintf (stderr, "child caught signal\n"); 
	  abort ();
	}
    }
  exit (0);
}

      parent reply	other threads:[~2006-07-16  6:50 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2006-07-15 17:07 2.6.17-1.2145_FC5 mmap-related soft lockup Gary Funck
2006-07-16  5:19 ` Andrew Morton
2006-07-19  2:25   ` Gary Funck
2006-07-19  4:04     ` Andrew Morton
2006-07-16  6:50 ` Gary Funck [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=JCEPIPKHCJGDMPOHDOIGKEKMDFAA.gary@intrepid.com \
    --to=gary@intrepid.com \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox