Resetting a Broadcom in software

All of lore.kernel.org
 help / color / mirror / Atom feed

* Resetting a Broadcom in software
@ 2006-09-06 21:41 Jonathan Day
  2006-09-06 22:32 ` Ralf Baechle
  0 siblings, 1 reply; 5+ messages in thread
From: Jonathan Day @ 2006-09-06 21:41 UTC (permalink / raw)
  To: linux-mips

[-- Attachment #1: Type: text/plain, Size: 1013 bytes --]

Hi,

A co-worker wrote the following test of the Broadcom's
maths abilities and discovered that it reboots some
(but not all) MIPS processors it has been tested on.
It'll reboot the Sentosa, for example, but NOT the
Swarm.

(Apologies for the ugly coding, btw.)

You just make the first file, the ATL_ file gets
included into it. The compiler flags I'm using are:

-march=sb1 -mabi=64 -fomit-frame-pointer -O3 -mips64
-mfused-madd

The program doesn't link to anything and no linker
flags are needed.

This begs three questions:

1) What is happening to cause the CPU to reset? (It's
not a kernel bug, it's an actual CPU reset)

2) What is NOT happening on the Swarm, allowing it to
work fine?

3) Is the problem in the category of "preventable in
hardware", "preventable in the kernel", or
"preventable by slowly roasting those coders who write
like this"?


__________________________________________________
Do You Yahoo!?
Tired of spam?  Yahoo! Mail has the best spam protection around 
http://mail.yahoo.com 

[-- Attachment #2: 2267489636-reboot.c --]
[-- Type: application/octet-stream, Size: 2202 bytes --]

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <string.h>     /* For strerror() */
#include <errno.h>      /* For errno */
#include <unistd.h>     /* For fork() */
#include <pthread.h>    /* For cpu_set_t */
#include <sys/wait.h>   /* For waitpid() */


/* Number of times each process calls the ATL_USERMM kernel in main(). */
#define ITERATIONS 1000
/* Edge length of the square test matrices; also the value of MB, NB and KB. */
#define ARRAY_SIZE 60

/*
 * Configuration consumed by the kernel source included below:
 *   TYPE        element type of the matrices
 *   ATL_USERMM  name given to the kernel's entry point
 *   BETA1       selects the BETA1 code path inside the kernel source
 *   MB, NB, KB  block dimensions compiled into the kernel
 */
#define TYPE double
#define ATL_USERMM ATL_dJIK60x60x48TN48x48x0_a1_b1
#define BETA1
#define MB ARRAY_SIZE
#define NB ARRAY_SIZE
#define KB ARRAY_SIZE
/* The kernel is textually included, so it shares this translation unit. */
#include "ATL_dmm12x1x12_mips.c"




/*
 * Hand-written prototype for sched_setaffinity(), used by main() below.
 * NOTE(review): presumably supplied by hand because the headers included
 * above do not declare it unless _GNU_SOURCE is defined - confirm, and
 * prefer getting the declaration from <sched.h>.
 */
extern int sched_setaffinity (__pid_t __pid, size_t __cpusetsize,
                              __const cpu_set_t *__cpuset) __THROW;

static __inline__ TYPE lf_random(void)
{
    return ((TYPE)(random() - (RAND_MAX / 2))) / 1000.0;
}

/*
 * Test driver for the ATL_USERMM kernel.
 *
 * fork()s once; the child restricts itself to the CPU selected by affinity
 * mask bit 0 and the parent to the CPU selected by bit 1.  Each process then
 * fills two ARRAY_SIZE x ARRAY_SIZE matrices with lf_random() values, zeroes
 * the result matrix and calls ATL_USERMM ITERATIONS times.  The parent reaps
 * the child before exiting.
 *
 * argc and argv are unused.
 *
 * Returns 0 on success, 1 if fork() fails and 2 if the allocation fails.
 */
int main(int argc, char **argv)
{
    pid_t pid;
    cpu_set_t cpuset = {{0}};
    register int i;
    TYPE *pMemory;
    register TYPE *A;
    register TYPE *B;
    register TYPE *C;

    (void)argc;
    (void)argv;

    pid = fork();
    if (pid < 0)
    {
        /* Without a second process the two-CPU test cannot run. */
        fprintf(stdout, "%s(%d): fork returned %d: %s\n", __func__, pid, errno, strerror(errno));
        return 1;
    }

    if (pid == 0)
    {
        /* child */
        cpuset.__bits[0] = 1;
    }
    else
    {
        /* parent */
        cpuset.__bits[0] = 2;
    }

    /* An affinity failure is reported but is not fatal. */
    if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0)
    {
        fprintf(stdout, "%s(%d): sched_setaffinity returned %d: %s\n", __func__, pid, errno, strerror(errno));
    }

    /*
     * Set bit 2 of FP control register $28.  The sequence reads the register
     * into %0, modifies it and writes it back, so %0 has to be declared as
     * an output operand; an input-only operand must never be written.
     * NOTE(review): $28 is presumably FENR, where bit 2 is the FS
     * (flush-to-zero) bit - confirm against the SB-1 documentation.
     */
    __asm__ volatile ("cfc1\t%0, $28\n\tori\t%0, %0, 0x4\n\tctc1\t%0, $28" : "=r" (i));

    pMemory = (TYPE *)malloc((ARRAY_SIZE * ARRAY_SIZE * 3) * sizeof(TYPE));
    if (pMemory == NULL)
    {
        fprintf(stdout, "%s(%d): malloc failed\n", __func__, pid);
        return 2;
    }

    /* One allocation holds the three matrices back to back: C, then B, then A. */
    C = pMemory;
    B = C + (ARRAY_SIZE * ARRAY_SIZE);
    A = B + (ARRAY_SIZE * ARRAY_SIZE);
    for (i = 0; i < (ARRAY_SIZE * ARRAY_SIZE); ++i)
    {
        A[i] = lf_random();
        B[i] = lf_random();
        /* The BETA1 kernel accumulates into C, so C needs a defined start value. */
        C[i] = 0.0;
    }

    fprintf(stdout, "%d: start\n", getpid());
    for (i = ITERATIONS; i; --i)
    {
#if 0
        fprintf(stdout, "%d: iteration %d\n", getpid(), i);
#endif
        ATL_USERMM(ARRAY_SIZE, ARRAY_SIZE, ARRAY_SIZE, 1.0, A, ARRAY_SIZE, B, ARRAY_SIZE, 1.0, C, ARRAY_SIZE);
    }

    fprintf(stdout, "%d: done\n", getpid());
    free(pMemory);

    if (pid > 0)
    {
        /* Parent only: reap the child so it is not left behind on exit. */
        if (waitpid(pid, NULL, 0) < 0)
        {
            fprintf(stdout, "%s(%d): waitpid returned %d: %s\n", __func__, pid, errno, strerror(errno));
        }
    }

    return 0;
}


[-- Attachment #3: 173074032-ATL_dmm12x1x12_mips.c --]
[-- Type: application/octet-stream, Size: 17299 bytes --]

/*
#include "atlas_misc.h"
 */

/* Number of elements handled per unrolled step of the kernels below. */
#define LF_DGEMM_BLOCK      12
/*
 * Non-zero enables the LF_DGEMM_PREF_LOAD prefetch hints; the value is also
 * used as a multiplier (in units of LF_DGEMM_BLOCK) for prefetch offsets.
 */
#define LF_DGEMM_PREFETCH   1

/*
 * Primitive operations used by the kernel macros:
 *   LF_LDC1(fd, p, vars)   fd = p[vars]
 *   LF_SDC1(fs, p, vars)   p[vars] = fs
 *   LF_MUL(fd, fs)         fd *= fs
 *   LF_MADD(fd, fs1, fs2)  fd += fs1 * fs2
 *   LF_DGEMM_PREF_LOAD     prefetch hint for &p[vars] (empty when disabled)
 *   LF_ASM(x)              emits x into the assembly output
 *
 * MIPS is defined unconditionally, so the inline-assembly variants are the
 * ones in effect; the #else branch is the plain C equivalent.
 */
#define       MIPS
#if   defined(MIPS)
  /*
   * The load and store access memory through a pointer held in a register,
   * which the compiler cannot see.  The "memory" clobber tells it so;
   * without it, ordinary C stores to the same arrays may be reordered
   * around, or removed ahead of, these statements.
   */
  #define LF_LDC1(fd, p, vars) \
    __asm__ volatile ("ldc1\t%0,%2(%1)" : "=f"(fd) : "r"(p), "n"((vars)*sizeof(TYPE)) : "memory")
  #define LF_SDC1(fs, p, vars) \
    __asm__ volatile ("sdc1\t%0,%2(%1)" : : "f"(fs), "r"(p), "n"((vars)*sizeof(TYPE)) : "memory")
  #define LF_MUL(fd, fs) \
    __asm__ volatile ("mul.d\t%0,%0,%1" : "+f"(fd) : "f"(fs))
  #define LF_MADD(fd, fs1, fs2) \
    __asm__ volatile ("madd.d\t%0,%0,%1,%2" : "+f"(fd) : "f"(fs1), "f"(fs2))
  #if (LF_DGEMM_PREFETCH != 0)
    #define LF_DGEMM_PREF_LOAD(address,vars) \
      __asm__ volatile ("pref 0, %1(%0)" : : "r"(address), "n"((vars)*sizeof(TYPE)))
  #else  /* LF_DGEMM_PREFETCH == 0 */
    #define LF_DGEMM_PREF_LOAD(address,vars)
  #endif
  #if 0
    #define LF_ASM(x)
  #else
    /* Used only for annotating assembly listings */
    #define LF_ASM(x) __asm__(x)
  #endif
#else
  #define LF_LDC1(fd, p, vars) (fd) = (p)[(vars)]
  #define LF_SDC1(fs, p, vars) (p)[(vars)] = (fs)
  #define LF_MUL(fd, fs) (fd) *= (fs)
  #define LF_MADD(fd, f1, f2) (fd) += (f1) * (f2)
  #if (LF_DGEMM_PREFETCH != 0)
    #include "atlas_prefetch.h"
    #define LF_DGEMM_PREF_LOAD(address,vars) ATL_pfl1R(&(address)[(vars)])
  #else  /* LF_DGEMM_PREFETCH == 0 */
    #define LF_DGEMM_PREF_LOAD(address,vars)
  #endif /* LF_DGEMM_PREFETCH == 0 */
  #define LF_ASM(x)
#endif


/*
 * PREFETCH_ROW(p, vars) - issue prefetch hints for p[vars+0], p[vars+4] and
 * p[vars+8].  Expands to the listing marker only when prefetching is
 * disabled (LF_DGEMM_PREF_LOAD is then empty).
 * NOTE(review): the stride of 4 elements presumably matches one cache line
 * of TYPE values - confirm for the target CPU.
 */
#define PREFETCH_ROW(p,vars) \
{ \
    LF_ASM(".globl PREFETCH_ROW"); \
    LF_DGEMM_PREF_LOAD(p, (vars)+0); \
    LF_DGEMM_PREF_LOAD(p, (vars)+4); \
    LF_DGEMM_PREF_LOAD(p, (vars)+8); \
}

/*
 * PREFETCH_COL(p, vars) - issue twelve prefetch hints, one per KB-strided
 * element: p[0*KB+vars], p[1*KB+vars], ... p[11*KB+vars].
 */
#define PREFETCH_COL(p,vars) \
{ \
    LF_ASM(".globl PREFETCH_COL"); \
    LF_DGEMM_PREF_LOAD(p, 0*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(p, 1*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(p, 2*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(p, 3*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(p, 4*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(p, 5*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(p, 6*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(p, 7*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(p, 8*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(p, 9*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(p, 10*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(p, 11*KB+(vars)); \
}

/*
 * STORE_C_BETA0(vars) - scale the accumulators c00..c11 by alpha and store
 * them to pC[vars+0] .. pC[vars+11], overwriting what was there (the old
 * contents of C are never loaded).  Also prefetches the row of pC that
 * starts LF_DGEMM_PREFETCH * LF_DGEMM_BLOCK elements further on.
 *
 * Expects c00..c11, alpha and pC to be in scope at the point of use.
 */
#define STORE_C_BETA0(vars) \
{ \
    LF_ASM(".globl STORE_C_BETA0"); \
    LF_MUL(c00, alpha); \
    LF_MUL(c01, alpha); \
    LF_MUL(c02, alpha); \
    LF_MUL(c03, alpha); \
    LF_MUL(c04, alpha); \
    LF_MUL(c05, alpha); \
    LF_MUL(c06, alpha); \
    LF_MUL(c07, alpha); \
    LF_MUL(c08, alpha); \
    LF_MUL(c09, alpha); \
    LF_MUL(c10, alpha); \
    LF_MUL(c11, alpha); \
    PREFETCH_ROW(pC, LF_DGEMM_PREFETCH * LF_DGEMM_BLOCK); \
    LF_SDC1(c00, pC, (vars)+0); \
    LF_SDC1(c01, pC, (vars)+1); \
    LF_SDC1(c02, pC, (vars)+2); \
    LF_SDC1(c03, pC, (vars)+3); \
    LF_SDC1(c04, pC, (vars)+4); \
    LF_SDC1(c05, pC, (vars)+5); \
    LF_SDC1(c06, pC, (vars)+6); \
    LF_SDC1(c07, pC, (vars)+7); \
    LF_SDC1(c08, pC, (vars)+8); \
    LF_SDC1(c09, pC, (vars)+9); \
    LF_SDC1(c10, pC, (vars)+10); \
    LF_SDC1(c11, pC, (vars)+11); \
}

/*
 * STORE_C_BETA1(vars) - accumulate into C:
 *     pC[vars+i] += c<i> * alpha      for i = 0..11
 *
 * The old C values are loaded two at a time.  a0/a1 hold the first pair;
 * after that each accumulator register is reused as the load target as soon
 * as its own value has been consumed, so the results end up in
 * a0, a1, c00..c09 (in that order) and are stored from there.  The statement
 * order is therefore significant.  Also prefetches the row of pC that starts
 * LF_DGEMM_PREFETCH * LF_DGEMM_BLOCK elements further on.
 *
 * Expects a0, a1, c00..c11, alpha and pC to be in scope at the point of use;
 * c00..c11 no longer hold the accumulators afterwards.
 */
#define STORE_C_BETA1(vars) \
{ \
    LF_ASM(".globl STORE_C_BETA1"); \
    LF_LDC1(a0, pC, (vars)+0); \
    LF_LDC1(a1, pC, (vars)+1); \
    LF_MADD(a0, c00, alpha); \
    LF_MADD(a1, c01, alpha); \
    LF_LDC1(c00, pC, (vars)+2); \
    LF_LDC1(c01, pC, (vars)+3); \
    LF_MADD(c00, c02, alpha); \
    LF_MADD(c01, c03, alpha); \
    LF_LDC1(c02, pC, (vars)+4); \
    LF_LDC1(c03, pC, (vars)+5); \
    LF_MADD(c02, c04, alpha); \
    LF_MADD(c03, c05, alpha); \
    LF_LDC1(c04, pC, (vars)+6); \
    LF_LDC1(c05, pC, (vars)+7); \
    LF_MADD(c04, c06, alpha); \
    LF_MADD(c05, c07, alpha); \
    LF_LDC1(c06, pC, (vars)+8); \
    LF_LDC1(c07, pC, (vars)+9); \
    LF_MADD(c06, c08, alpha); \
    LF_MADD(c07, c09, alpha); \
    LF_LDC1(c08, pC, (vars)+10); \
    LF_LDC1(c09, pC, (vars)+11); \
    LF_MADD(c08, c10, alpha); \
    LF_MADD(c09, c11, alpha); \
    PREFETCH_ROW(pC, LF_DGEMM_PREFETCH * LF_DGEMM_BLOCK); \
    LF_SDC1(a0, pC, (vars)+0); \
    LF_SDC1(a1, pC, (vars)+1); \
    LF_SDC1(c00, pC, (vars)+2); \
    LF_SDC1(c01, pC, (vars)+3); \
    LF_SDC1(c02, pC, (vars)+4); \
    LF_SDC1(c03, pC, (vars)+5); \
    LF_SDC1(c04, pC, (vars)+6); \
    LF_SDC1(c05, pC, (vars)+7); \
    LF_SDC1(c06, pC, (vars)+8); \
    LF_SDC1(c07, pC, (vars)+9); \
    LF_SDC1(c08, pC, (vars)+10); \
    LF_SDC1(c09, pC, (vars)+11); \
}

/*
 * STORE_C_BETAX(vars) - general-beta update of C:
 *     pC[vars+i] = pC[vars+i] * beta + c<i> * alpha      for i = 0..11
 *
 * Same register-recycling scheme as the beta == 1 store: old C values are
 * loaded in pairs into a0/a1 and then into accumulator registers whose own
 * value has already been consumed, so the results are stored from
 * a0, a1, c00..c09.  The statement order is therefore significant.  Also
 * prefetches the row of pC that starts LF_DGEMM_PREFETCH * LF_DGEMM_BLOCK
 * elements further on.
 *
 * Expects a0, a1, c00..c11, alpha, beta and pC to be in scope at the point
 * of use; c00..c11 no longer hold the accumulators afterwards.
 */
#define STORE_C_BETAX(vars) \
{ \
    LF_ASM(".globl STORE_C_BETAX"); \
    LF_LDC1(a0, pC, (vars)+0); \
    LF_LDC1(a1, pC, (vars)+1); \
    LF_MUL(a0, beta); \
    LF_MUL(a1, beta); \
    LF_MADD(a0, c00, alpha); \
    LF_MADD(a1, c01, alpha); \
    LF_LDC1(c00, pC, (vars)+2); \
    LF_LDC1(c01, pC, (vars)+3); \
    LF_MUL(c00, beta); \
    LF_MUL(c01, beta); \
    LF_MADD(c00, c02, alpha); \
    LF_MADD(c01, c03, alpha); \
    LF_LDC1(c02, pC, (vars)+4); \
    LF_LDC1(c03, pC, (vars)+5); \
    LF_MUL(c02, beta); \
    LF_MUL(c03, beta); \
    LF_MADD(c02, c04, alpha); \
    LF_MADD(c03, c05, alpha); \
    LF_LDC1(c04, pC, (vars)+6); \
    LF_LDC1(c05, pC, (vars)+7); \
    LF_MUL(c04, beta); \
    LF_MUL(c05, beta); \
    LF_MADD(c04, c06, alpha); \
    LF_MADD(c05, c07, alpha); \
    LF_LDC1(c06, pC, (vars)+8); \
    LF_LDC1(c07, pC, (vars)+9); \
    LF_MUL(c06, beta); \
    LF_MUL(c07, beta); \
    LF_MADD(c06, c08, alpha); \
    LF_MADD(c07, c09, alpha); \
    LF_LDC1(c08, pC, (vars)+10); \
    LF_LDC1(c09, pC, (vars)+11); \
    LF_MUL(c08, beta); \
    LF_MUL(c09, beta); \
    LF_MADD(c08, c10, alpha); \
    LF_MADD(c09, c11, alpha); \
    PREFETCH_ROW(pC, LF_DGEMM_PREFETCH * LF_DGEMM_BLOCK); \
    LF_SDC1(a0, pC, (vars)+0); \
    LF_SDC1(a1, pC, (vars)+1); \
    LF_SDC1(c00, pC, (vars)+2); \
    LF_SDC1(c01, pC, (vars)+3); \
    LF_SDC1(c02, pC, (vars)+4); \
    LF_SDC1(c03, pC, (vars)+5); \
    LF_SDC1(c04, pC, (vars)+6); \
    LF_SDC1(c05, pC, (vars)+7); \
    LF_SDC1(c06, pC, (vars)+8); \
    LF_SDC1(c07, pC, (vars)+9); \
    LF_SDC1(c08, pC, (vars)+10); \
    LF_SDC1(c09, pC, (vars)+11); \
}

/*
 * LOAD_B() - load pB[0..11] into b00..b11, issue prefetch hints for the row
 * that starts KB elements further on, then advance pB by KB.
 *
 * Expects b00..b11 and pB to be in scope; pB is modified.
 */
#define LOAD_B() \
{ \
    LF_ASM(".globl LOAD_B"); \
    LF_LDC1(b00, pB, 0); \
    LF_LDC1(b01, pB, 1); \
    LF_LDC1(b02, pB, 2); \
    LF_LDC1(b03, pB, 3); \
    LF_LDC1(b04, pB, 4); \
    LF_LDC1(b05, pB, 5); \
    LF_LDC1(b06, pB, 6); \
    LF_LDC1(b07, pB, 7); \
    LF_LDC1(b08, pB, 8); \
    LF_LDC1(b09, pB, 9); \
    LF_LDC1(b10, pB, 10); \
    LF_LDC1(b11, pB, 11); \
    PREFETCH_ROW(pB, KB); \
    pB += KB; \
}

/*
 * MULT_C(bx, vars) - start a new set of accumulators:
 *     c<i> = pA[i*KB + vars] * bx      for i = 0..11
 *
 * Loads and multiplies are interleaved in pairs.  Expects c00..c11 and pA to
 * be in scope; any previous contents of c00..c11 are discarded.
 */
#define MULT_C(bx, vars) \
{ \
    LF_ASM(".globl MULT_C"); \
    LF_LDC1(c00, pA, 0*KB+(vars)); \
    LF_LDC1(c01, pA, 1*KB+(vars)); \
    LF_MUL(c00, bx); \
    LF_MUL(c01, bx); \
    LF_LDC1(c02, pA, 2*KB+(vars)); \
    LF_LDC1(c03, pA, 3*KB+(vars)); \
    LF_MUL(c02, bx); \
    LF_MUL(c03, bx); \
    LF_LDC1(c04, pA, 4*KB+(vars)); \
    LF_LDC1(c05, pA, 5*KB+(vars)); \
    LF_MUL(c04, bx); \
    LF_MUL(c05, bx); \
    LF_LDC1(c06, pA, 6*KB+(vars)); \
    LF_LDC1(c07, pA, 7*KB+(vars)); \
    LF_MUL(c06, bx); \
    LF_MUL(c07, bx); \
    LF_LDC1(c08, pA, 8*KB+(vars)); \
    LF_LDC1(c09, pA, 9*KB+(vars)); \
    LF_MUL(c08, bx); \
    LF_MUL(c09, bx); \
    LF_LDC1(c10, pA, 10*KB+(vars)); \
    LF_LDC1(c11, pA, 11*KB+(vars)); \
    LF_MUL(c10, bx); \
    LF_MUL(c11, bx); \
}

/*
 * MADD_A(bx, vars) - add one more term to the accumulators:
 *     c<i> += pA[i*KB + vars] * bx      for i = 0..11
 *
 * a0 and a1 are reused as the load targets for each pair of elements.
 * Expects a0, a1, c00..c11 and pA to be in scope.
 */
#define MADD_A(bx, vars) \
{ \
    LF_ASM(".globl MADD_A"); \
    LF_LDC1(a0, pA, 0*KB+(vars)); \
    LF_LDC1(a1, pA, 1*KB+(vars)); \
    LF_MADD(c00, a0, bx); \
    LF_MADD(c01, a1, bx); \
    LF_LDC1(a0, pA, 2*KB+(vars)); \
    LF_LDC1(a1, pA, 3*KB+(vars)); \
    LF_MADD(c02, a0, bx); \
    LF_MADD(c03, a1, bx); \
    LF_LDC1(a0, pA, 4*KB+(vars)); \
    LF_LDC1(a1, pA, 5*KB+(vars)); \
    LF_MADD(c04, a0, bx); \
    LF_MADD(c05, a1, bx); \
    LF_LDC1(a0, pA, 6*KB+(vars)); \
    LF_LDC1(a1, pA, 7*KB+(vars)); \
    LF_MADD(c06, a0, bx); \
    LF_MADD(c07, a1, bx); \
    LF_LDC1(a0, pA, 8*KB+(vars)); \
    LF_LDC1(a1, pA, 9*KB+(vars)); \
    LF_MADD(c08, a0, bx); \
    LF_MADD(c09, a1, bx); \
    LF_LDC1(a0, pA, 10*KB+(vars)); \
    LF_LDC1(a1, pA, 11*KB+(vars)); \
    LF_MADD(c10, a0, bx); \
    LF_MADD(c11, a1, bx); \
}

/*
 * PMAD_A(bx, vars) - same arithmetic as MADD_A:
 *     c<i> += pA[i*KB + vars] * bx      for i = 0..11
 * and, interleaved with it, a prefetch hint for
 *     pA[(LF_DGEMM_PREFETCH*LF_DGEMM_BLOCK + i)*KB + vars]
 * i.e. the element LF_DGEMM_PREFETCH*LF_DGEMM_BLOCK KB-strides further on.
 *
 * Expects a0, a1, c00..c11 and pA to be in scope.
 */
#define PMAD_A(bx, vars) \
{ \
    LF_ASM(".globl PMAD_A"); \
    LF_LDC1(a0, pA, 0*KB+(vars)); \
    LF_LDC1(a1, pA, 1*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(pA, (LF_DGEMM_PREFETCH*LF_DGEMM_BLOCK+0)*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(pA, (LF_DGEMM_PREFETCH*LF_DGEMM_BLOCK+1)*KB+(vars)); \
    LF_MADD(c00, a0, bx); \
    LF_MADD(c01, a1, bx); \
    LF_LDC1(a0, pA, 2*KB+(vars)); \
    LF_LDC1(a1, pA, 3*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(pA, (LF_DGEMM_PREFETCH*LF_DGEMM_BLOCK+2)*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(pA, (LF_DGEMM_PREFETCH*LF_DGEMM_BLOCK+3)*KB+(vars)); \
    LF_MADD(c02, a0, bx); \
    LF_MADD(c03, a1, bx); \
    LF_LDC1(a0, pA, 4*KB+(vars)); \
    LF_LDC1(a1, pA, 5*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(pA, (LF_DGEMM_PREFETCH*LF_DGEMM_BLOCK+4)*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(pA, (LF_DGEMM_PREFETCH*LF_DGEMM_BLOCK+5)*KB+(vars)); \
    LF_MADD(c04, a0, bx); \
    LF_MADD(c05, a1, bx); \
    LF_LDC1(a0, pA, 6*KB+(vars)); \
    LF_LDC1(a1, pA, 7*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(pA, (LF_DGEMM_PREFETCH*LF_DGEMM_BLOCK+6)*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(pA, (LF_DGEMM_PREFETCH*LF_DGEMM_BLOCK+7)*KB+(vars)); \
    LF_MADD(c06, a0, bx); \
    LF_MADD(c07, a1, bx); \
    LF_LDC1(a0, pA, 8*KB+(vars)); \
    LF_LDC1(a1, pA, 9*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(pA, (LF_DGEMM_PREFETCH*LF_DGEMM_BLOCK+8)*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(pA, (LF_DGEMM_PREFETCH*LF_DGEMM_BLOCK+9)*KB+(vars)); \
    LF_MADD(c08, a0, bx); \
    LF_MADD(c09, a1, bx); \
    LF_LDC1(a0, pA, 10*KB+(vars)); \
    LF_LDC1(a1, pA, 11*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(pA, (LF_DGEMM_PREFETCH*LF_DGEMM_BLOCK+10)*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(pA, (LF_DGEMM_PREFETCH*LF_DGEMM_BLOCK+11)*KB+(vars)); \
    LF_MADD(c10, a0, bx); \
    LF_MADD(c11, a1, bx); \
}



/*
 *  NOTES
 *      These functions assume the data set fits in L1 cache,
 *      therefore each variable need only be prefetched once.
 *
 *  OUTPUT
 *      C += alpha * A(T) * B
 */

/*
 * pB and pC are plain aliases for the kernel functions' B and C parameters.
 * The helper macros above refer to pB/pC, so statements such as "pB += KB"
 * advance the parameters themselves rather than separate cursor variables.
 * The aliases stay defined for the rest of the translation unit.
 */
#define pB  B
#define pC  C

/*  1 FP registers */
/*  4 GP registers */
#if   defined(BETA0)
/*
 * first_beta0() - C = alpha * A(T) * B for one LF_DGEMM_BLOCK-wide slice of
 * k, overwriting C (STORE_C_BETA0 never reads the old contents).
 *
 * For each row of B (rows are KB elements apart) and each of the MB entries
 * of the matching row of C:
 *     C[i] = alpha * sum(k = 0..11) A[i*KB + k] * B[k]
 *
 *   alpha  scale factor applied when the accumulators are stored
 *   A      start of the A slice; element (i, k) is read at A[i*KB + k]
 *   B      start of the B slice; advanced locally through the pB alias
 *   C      output; advanced locally through the pC alias
 *   ldc_M  added to pC after each row of MB entries (callers pass ldc - M)
 *   pEndB  the row loop stops once pB has reached this address
 *
 * The first row is always processed, whatever pEndB is.  It uses PMAD_A for
 * b03, b07 and b11 so that the next block of A is prefetched; later rows use
 * MADD_A only.
 */
static void first_beta0(const TYPE alpha, const TYPE *A, const TYPE *B, TYPE *C, const int ldc_M, const TYPE *pEndB)
{
    /* 26 FP registers */
    register TYPE           a0,a1;
    register TYPE           b00,b01,b02,b03,b04,b05,b06,b07,b08,b09,b10,b11;
    register TYPE           c00,c01,c02,c03,c04,c05,c06,c07,c08,c09,c10,c11;

    /*  3 GP registers */
    register const TYPE *   pA;
    register const TYPE *   pEndC;

    /* Prefetch first row of B */
    PREFETCH_ROW(pB, 0);

    /* Prefetch first block of A */
    pA = A;
    PREFETCH_COL(pA,  0);
    PREFETCH_COL(pA,  4);
    PREFETCH_COL(pA,  8);

    /* Prefetch first row of C */
    PREFETCH_ROW(pC, 0);

    /* for j = 0 */
    {
        pEndC = &pC[MB];
        LOAD_B();
        while (pC < pEndC) /* for i = 0..MB step LF_DGEMM_BLOCK */
        {
            MULT_C(b00,  0);
            MADD_A(b01,  1);
            MADD_A(b02,  2);
            PMAD_A(b03,  3);
            MADD_A(b04,  4);
            MADD_A(b05,  5);
            MADD_A(b06,  6);
            PMAD_A(b07,  7);
            MADD_A(b08,  8);
            MADD_A(b09,  9);
            MADD_A(b10, 10);
            PMAD_A(b11, 11);
            pA += LF_DGEMM_BLOCK * KB;
            STORE_C_BETA0(0);
            pC += LF_DGEMM_BLOCK;
        } /* for i */
        pC += ldc_M; /* ldc - M */
    }

    while (pB < pEndB) /* for j = 1..N */
    {
        pEndC = &pC[MB];
        pA = A;
        LOAD_B()
        while (pC < pEndC) /* for i = 0..MB step LF_DGEMM_BLOCK */
        {
            MULT_C(b00,  0);
            MADD_A(b01,  1);
            MADD_A(b02,  2);
            MADD_A(b03,  3);
            MADD_A(b04,  4);
            MADD_A(b05,  5);
            MADD_A(b06,  6);
            MADD_A(b07,  7);
            MADD_A(b08,  8);
            MADD_A(b09,  9);
            MADD_A(b10, 10);
            MADD_A(b11, 11);
            pA += LF_DGEMM_BLOCK * KB;
            STORE_C_BETA0(0);
            pC += LF_DGEMM_BLOCK;
        } /* for i */
        pC += ldc_M; /* ldc - M */
    } /* for j */
} /* first_beta0() */
#endif /* BETA0 */

/*  2 FP registers */
/*  4 GP registers */
#if   defined(BETAX)
/*
 * first_betax() - C = beta * C + alpha * A(T) * B for one
 * LF_DGEMM_BLOCK-wide slice of k.
 *
 * For each row of B (rows are KB elements apart) and each of the MB entries
 * of the matching row of C:
 *     C[i] = beta * C[i] + alpha * sum(k = 0..11) A[i*KB + k] * B[k]
 *
 *   alpha  scale factor applied to the accumulated products
 *   beta   scale factor applied to the old contents of C
 *   A      start of the A slice; element (i, k) is read at A[i*KB + k]
 *   B      start of the B slice; advanced locally through the pB alias
 *   C      input/output; advanced locally through the pC alias
 *   ldc_M  added to pC after each row of MB entries (callers pass ldc - M)
 *   pEndB  the row loop stops once pB has reached this address
 *
 * The first row is always processed, whatever pEndB is.  It uses PMAD_A for
 * b03, b07 and b11 so that the next block of A is prefetched; later rows use
 * MADD_A only.
 */
static void first_betax(const TYPE alpha, const TYPE beta, const TYPE *A, const TYPE *B, TYPE *C, const int ldc_M, const TYPE *pEndB)
{
    /* 26 FP registers */
    register TYPE           a0,a1;
    register TYPE           b00,b01,b02,b03,b04,b05,b06,b07,b08,b09,b10,b11;
    register TYPE           c00,c01,c02,c03,c04,c05,c06,c07,c08,c09,c10,c11;

    /*  3 GP registers */
    register const TYPE *   pA;
    register const TYPE *   pEndC;

    /* Prefetch first row of B */
    PREFETCH_ROW(pB, 0);

    /* Prefetch first block of A */
    pA = A;
    PREFETCH_COL(pA,  0);
    PREFETCH_COL(pA,  4);
    PREFETCH_COL(pA,  8);

    /* Prefetch first row of C */
    PREFETCH_ROW(pC, 0);

    /* for j = 0 */
    {
        pEndC = &pC[MB];
        LOAD_B();
        while (pC < pEndC) /* for i = 0..MB step LF_DGEMM_BLOCK */
        {
            MULT_C(b00,  0);
            MADD_A(b01,  1);
            MADD_A(b02,  2);
            PMAD_A(b03,  3);
            MADD_A(b04,  4);
            MADD_A(b05,  5);
            MADD_A(b06,  6);
            PMAD_A(b07,  7);
            MADD_A(b08,  8);
            MADD_A(b09,  9);
            MADD_A(b10, 10);
            PMAD_A(b11, 11);
            pA += LF_DGEMM_BLOCK * KB;
            STORE_C_BETAX(0);
            pC += LF_DGEMM_BLOCK;
        } /* for i */
        pC += ldc_M; /* ldc - M */
    }

    while (pB < pEndB) /* for j = 1..N */
    {
        pEndC = &pC[MB];
        pA = A;
        LOAD_B()
        while (pC < pEndC) /* for i = 0..MB step LF_DGEMM_BLOCK */
        {
            MULT_C(b00,  0);
            MADD_A(b01,  1);
            MADD_A(b02,  2);
            MADD_A(b03,  3);
            MADD_A(b04,  4);
            MADD_A(b05,  5);
            MADD_A(b06,  6);
            MADD_A(b07,  7);
            MADD_A(b08,  8);
            MADD_A(b09,  9);
            MADD_A(b10, 10);
            MADD_A(b11, 11);
            pA += LF_DGEMM_BLOCK * KB;
            STORE_C_BETAX(0);
            pC += LF_DGEMM_BLOCK;
        } /* for i */
        pC += ldc_M; /* ldc - M */
    } /* for j */
} /* first_betax() */
#endif /* BETAX */

/*  1 FP registers */
/*  4 GP registers */
/*
 * next() - C += alpha * A(T) * B for one LF_DGEMM_BLOCK-wide slice of k
 * (STORE_C_BETA1 adds to the existing contents of C).
 *
 * For each row of B (rows are KB elements apart) and each of the MB entries
 * of the matching row of C:
 *     C[i] += alpha * sum(k = 0..11) A[i*KB + k] * B[k]
 *
 *   alpha  scale factor applied to the accumulated products
 *   A      start of the A slice; element (i, k) is read at A[i*KB + k]
 *   B      start of the B slice; advanced locally through the pB alias
 *   C      input/output; advanced locally through the pC alias
 *   ldc_M  added to pC after each row of MB entries (callers pass ldc - M)
 *   pEndB  the row loop stops once pB has reached this address
 *
 * The first row is always processed, whatever pEndB is.  It uses PMAD_A for
 * b03, b07 and b11 so that the next block of A is prefetched; later rows use
 * MADD_A only.  C must hold defined values on entry because it is read.
 */
static void next(const TYPE alpha, const TYPE *A, const TYPE *B, TYPE *C, const int ldc_M, const TYPE *pEndB)
{
    /* 26 FP registers */
    register TYPE           a0,a1;
    register TYPE           b00,b01,b02,b03,b04,b05,b06,b07,b08,b09,b10,b11;
    register TYPE           c00,c01,c02,c03,c04,c05,c06,c07,c08,c09,c10,c11;

    /*  3 GP registers */
    register const TYPE *   pA;
    register const TYPE *   pEndC;

    /* Prefetch first row of B */
    PREFETCH_ROW(pB, 0);

    /* Prefetch first block of A */
    pA = A;
    PREFETCH_COL(pA,  0);
    PREFETCH_COL(pA,  4);
    PREFETCH_COL(pA,  8);

    /* Prefetch first row of C */
    PREFETCH_ROW(pC, 0);

    /* for j = 0 */
    {
        pEndC = &pC[MB];
        LOAD_B();
        while (pC < pEndC) /* for i = 0..MB step LF_DGEMM_BLOCK */
        {
            MULT_C(b00,  0);
            MADD_A(b01,  1);
            MADD_A(b02,  2);
            PMAD_A(b03,  3);
            MADD_A(b04,  4);
            MADD_A(b05,  5);
            MADD_A(b06,  6);
            PMAD_A(b07,  7);
            MADD_A(b08,  8);
            MADD_A(b09,  9);
            MADD_A(b10, 10);
            PMAD_A(b11, 11);
            pA += LF_DGEMM_BLOCK * KB;
            STORE_C_BETA1(0);
            pC += LF_DGEMM_BLOCK;
        } /* for i */
        pC += ldc_M; /* ldc - M */
    }

    while (pB < pEndB) /* for j = 1..N */
    {
        pEndC = &pC[MB];
        pA = A;
        LOAD_B()
        while (pC < pEndC) /* for i = 0..MB step LF_DGEMM_BLOCK */
        {
            MULT_C(b00,  0);
            MADD_A(b01,  1);
            MADD_A(b02,  2);
            MADD_A(b03,  3);
            MADD_A(b04,  4);
            MADD_A(b05,  5);
            MADD_A(b06,  6);
            MADD_A(b07,  7);
            MADD_A(b08,  8);
            MADD_A(b09,  9);
            MADD_A(b10, 10);
            MADD_A(b11, 11);
            pA += LF_DGEMM_BLOCK * KB;
            STORE_C_BETA1(0);
            pC += LF_DGEMM_BLOCK;
        } /* for i */
        pC += ldc_M; /* ldc - M */
    } /* for j */
} /* next() */



void ATL_USERMM(const int M, const int N, const int K,
                const TYPE alpha,
                const TYPE *A, const int lda,
                const TYPE *B, const int ldb,
                const TYPE beta,
                      TYPE *C, const int ldc)
{
    register int ldc_M = ldc - M;
    register const TYPE *pEndB = &B[KB * N];
    register int k;

#if   defined(BETA0)
    first_beta0(alpha, A, B, C, ldc_M, pEndB);
#elif defined(BETA1)
    next(alpha, A, B, C, ldc_M, pEndB);
#else
    first_betax(alpha, beta, A, B, C, ldc_M, pEndB);
#endif

    for (k = LF_DGEMM_BLOCK; k < K; k += LF_DGEMM_BLOCK)
    {
        next(alpha, &A[k], &B[k], C, ldc_M, pEndB);
    }
}


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: Resetting a Broadcom in software
  2006-09-06 21:41 Resetting a Broadcom in software Jonathan Day
@ 2006-09-06 22:32 ` Ralf Baechle
  2006-09-06 22:59   ` Jonathan Day
  0 siblings, 1 reply; 5+ messages in thread
From: Ralf Baechle @ 2006-09-06 22:32 UTC (permalink / raw)
  To: Jonathan Day; +Cc: linux-mips

On Wed, Sep 06, 2006 at 02:41:36PM -0700, Jonathan Day wrote:
> Date:	Wed, 6 Sep 2006 14:41:36 -0700 (PDT)
> From:	Jonathan Day <imipak@yahoo.com>
> Subject: Resetting a Broadcom in software
> To:	linux-mips@linux-mips.org
> Content-Type: multipart/mixed; boundary="0-1897457798-1157578896=:54718"
> 
> Hi,
> 
> A co-worker wrote the following test of the Broadcom's
> maths abilities and discovered that it reboots some
> (but not all) MIPS processors it has been tested on.
> It'll reboot the Sentosa, for example, but NOT the
> Swarm.
> 
> (Apologies for the ugly coding, btw.)
> 
> You just make the first file, the ATL_ file gets
> included into it. The compiler flags I'm using are:
> 
> -march=sb1 -mabi=64 -fomit-frame-pointer -O3 -mips64
> -mfused-madd
> 
> The program doesn't link to anything and no linker
> flags are needed.
> 
> This begs three questions:
> 
> 1) What is happening to cause the CPU to reset? (It's
> not a kernel bug, it's an actual CPU reset)
> 
> 2) What is NOT happening on the Swarm, allowing it to
> work fine?
> 
> 3) Is the problem in the category of "preventable in
> hardware", "preventable in the kernel", or
> "preventable by slowly roasting those coders who write
> like this"?

This is not a problem I know of, but given your description it sounds very
much like a hardware issue.  Can you find out the exact versions of the
1250 on the various boards?  With the FPU being on chip I would expect
some correlation between the chip revision and this issue.

  Ralf

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: Resetting a Broadcom in software
  2006-09-06 22:32 ` Ralf Baechle
@ 2006-09-06 22:59   ` Jonathan Day
  2006-09-07  1:03     ` Ralf Baechle
  0 siblings, 1 reply; 5+ messages in thread
From: Jonathan Day @ 2006-09-06 22:59 UTC (permalink / raw)
  To: Ralf Baechle; +Cc: linux-mips

The Sentosa uses a dual-core Broadcom 1250 processor
with an SB1 version 0.2 core. The board is BCM91250E
Revision 1.

The Swarm also uses a Broadcom 1250 with an SB1
version 0.2 core, but the board is a BCM91250A.

Most of the difference seems to be in the motherboard,
rather than the CPU, but I couldn't tell you what the
difference is between an E and an A, and why the A
seems better-behaved.

--- Ralf Baechle <ralf@linux-mips.org> wrote:
> This is not a problem I know of but given your
> description it sounds very
> much like a hardware issue.  Can you find about the
> exact versions of the
> 1250 on the various board?  With the FPU being on
> chip I would expect
> some correlation between the chip revision and this
> issue.

__________________________________________________
Do You Yahoo!?
Tired of spam?  Yahoo! Mail has the best spam protection around 
http://mail.yahoo.com 

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: Resetting a Broadcom in software
  2006-09-06 22:59   ` Jonathan Day
@ 2006-09-07  1:03     ` Ralf Baechle
  2006-09-07 18:28       ` Jonathan Day
  0 siblings, 1 reply; 5+ messages in thread
From: Ralf Baechle @ 2006-09-07  1:03 UTC (permalink / raw)
  To: Jonathan Day; +Cc: linux-mips

On Wed, Sep 06, 2006 at 03:59:39PM -0700, Jonathan Day wrote:

> The Sentosa uses a dual-core Broadcom 1250 processor
> with an SB1 version 0.2 core. The board is BCM91250E
> Revision 1.
> 
> The Swarm also uses a Broadcom 1250 with an SB1
> version 0.2 core, but the board is a BCM91250A.
> 
> Most of the difference seems to be in the motherboard,
> rather than the CPU, but I couldn't tell you what the
> difference is between an E and an A, and why the A
> seems better-behaved.

There are sub-types to pass 2 but I don't know how to identify those.
Probably by the content of the wafer id register or something like that.

  Ralf

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: Resetting a Broadcom in software
  2006-09-07  1:03     ` Ralf Baechle
@ 2006-09-07 18:28       ` Jonathan Day
  0 siblings, 0 replies; 5+ messages in thread
From: Jonathan Day @ 2006-09-07 18:28 UTC (permalink / raw)
  To: Ralf Baechle; +Cc: linux-mips

Ok, I have the information...

--- Ralf Baechle <ralf@linux-mips.org> wrote:
> There are sub-types to pass 2 but I don't know how
> to identify those.
> Probably by the content of the wafer id register or
> something like that.

Sentosa (which resets on running the program):

Wafer ID: 92cee019 Lot 9395 Wafer 23
Mfg Test: Bin A
CPU: 1040102

Linux' /proc/cpuinfo says a little more:

cpu model               : SiByte SB1 V0.2
(lots of uninteresting stuff)
ASEs installed          : mdmx

Swarm (which does not reset on running the program):
Wafer ID: 5838e019 Lot 5646 Wafer 7
Mfg Test: Bin A
CPU: 1040102

Linux' /proc/cpuinfo:

cpu model               : SiByte SB1 V0.2 FPU 0.2
(more boring stuff)
ASEs installed          : mdmx mips3d

On an aside, can anyone suggest some good values for
Linux' "machine selection" kernel config menu? Well, I
guess it's not really an aside as it's just occurred
to me that the difference in wafer may require
tweaking beyond just setting the system type. Also, if
anyone knows of "must set" options elsewhere, I'd
appreciate knowing.

I know some:

1. Pages are 4K, unless the big page patch is applied
(try saying that three times quickly).
2. The Broadcom tech docs don't document the presence
of multi-threading in the CPU, so I'm assuming that
has to be off.
3. Most of the profiling options seem to barf in ways
that can only be described as spectacular.
4. I don't know which debug options are needed or not
needed, but certain apparently random permutations
result in a working kernel, others will cause it to
explode violently on bootup.

Some third-party patches work... ...when the
maintainer keeps them up-to-date. Mingo's -rt patches
go in clean and seem to run fine, for example, but I'm
cautious applying 2.6.17 patches to a 2.6.18-rc6
kernel (the current MIPSified Linux kernel in git).

Unfortunately, a lot of the really really good 3rd
party patches are for Intel processors only, and I'm
reluctant to keep a port in sync, partly for reasons
of time but also because I'm not convinced I
understand the mechanisms used by the truly
exceptional stuff well enough to implement on a
platform I'm still figuring out some of the
characteristics of.

Jonathan

__________________________________________________
Do You Yahoo!?
Tired of spam?  Yahoo! Mail has the best spam protection around 
http://mail.yahoo.com 

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2006-09-07 18:29 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-09-06 21:41 Resetting a Broadcom in software Jonathan Day
2006-09-06 22:32 ` Ralf Baechle
2006-09-06 22:59   ` Jonathan Day
2006-09-07  1:03     ` Ralf Baechle
2006-09-07 18:28       ` Jonathan Day

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.