All of lore.kernel.org
 help / color / mirror / Atom feed
* Resetting a Broadcom in software
@ 2006-09-06 21:41 Jonathan Day
  2006-09-06 22:32 ` Ralf Baechle
  0 siblings, 1 reply; 5+ messages in thread
From: Jonathan Day @ 2006-09-06 21:41 UTC (permalink / raw)
  To: linux-mips

[-- Attachment #1: Type: text/plain, Size: 1013 bytes --]

Hi,

A co-worker wrote the following test of the Broadcom's
maths abilities and discovered that it reboots some
(but not all) MIPS processors it has been tested on.
It'll reboot the Sentosa, for example, but NOT the
Swarm.

(Apologies for the ugly coding, btw.)

You just make the first file, the ATL_ file gets
included into it. The compiler flags I'm using are:

-march=sb1 -mabi=64 -fomit-frame-pointer -O3 -mips64
-mfused-madd

The program doesn't link to anything and no linker
flags are needed.

This begs three questions:

1) What is happening to cause the CPU to reset? (It's
not a kernel bug, it's an actual CPU reset)

2) What is NOT happening on the Swarm, allowing it to
work fine?

3) Is the problem in the category of "preventable in
hardware", "preventable in the kernel", or
"preventable by slowly roasting those coders who write
like this"?


__________________________________________________
Do You Yahoo!?
Tired of spam?  Yahoo! Mail has the best spam protection around 
http://mail.yahoo.com 

[-- Attachment #2: 2267489636-reboot.c --]
[-- Type: application/octet-stream, Size: 2202 bytes --]

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <string.h>     /* For strerror() */
#include <errno.h>      /* For errno */
#include <unistd.h>     /* For fork() */
#include <pthread.h>    /* For cpu_set_t */
#include <sys/wait.h>   /* For waitpid() */


#define ITERATIONS 1000
#define ARRAY_SIZE 60

#define TYPE double
#define ATL_USERMM ATL_dJIK60x60x48TN48x48x0_a1_b1
#define BETA1
#define MB ARRAY_SIZE
#define NB ARRAY_SIZE
#define KB ARRAY_SIZE
#include "ATL_dmm12x1x12_mips.c"




extern int sched_setaffinity (__pid_t __pid, size_t __cpusetsize,
                              __const cpu_set_t *__cpuset) __THROW;

static __inline__ TYPE lf_random(void)
{
    return ((TYPE)(random() - (RAND_MAX / 2))) / 1000.0;
}

int main(int argc, char **argv)
{
    pid_t pid;
    cpu_set_t cpuset = {{0}};
    register int i;
    TYPE *pMemory;
    register TYPE *A;
    register TYPE *B;
    register TYPE *C;

    pid = fork();
    if (pid == 0)
    {
        /* child */
        cpuset.__bits[0] = 1;
    }
    else
    {
        /* parent */
        cpuset.__bits[0] = 2;
    }

    if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0)
    {
       fprintf(stdout, "%s(%d): sched_setaffinity returned %d: %s\n", __func__, pid, errno, strerror(errno));
    }

    /* A little bit of FP FCSR magic... */
    i = 0;
    __asm__ volatile ("cfc1\t%0, $28\n\tori\t%0, %0, 0x4\n\tctc1\t%0, $28" : : "r" (i));

    pMemory = (TYPE *)malloc((ARRAY_SIZE * ARRAY_SIZE * 3) * sizeof(TYPE));
    if (pMemory == NULL)
    {
       fprintf(stdout, "%s(%d): malloc failed\n", __func__, pid);
       return 2;
    }

    C = pMemory;
    B = C + (ARRAY_SIZE * ARRAY_SIZE);
    A = B + (ARRAY_SIZE * ARRAY_SIZE);
    for (i = 0; i < (ARRAY_SIZE * ARRAY_SIZE); ++i)
    {
        A[i] = lf_random();
        B[i] = lf_random();
    }

    fprintf(stdout, "%d: start\n", getpid());
    for (i = ITERATIONS; i; --i)
    {
#if 0
        fprintf(stdout, "%d: iteration %d\n", getpid(), i);
#endif
        ATL_USERMM(ARRAY_SIZE, ARRAY_SIZE, ARRAY_SIZE, 1.0, A, ARRAY_SIZE, B, ARRAY_SIZE, 1.0, C, ARRAY_SIZE);
    }

    fprintf(stdout, "%d: done\n", getpid());
    free(pMemory);
    return 0;
}


[-- Attachment #3: 173074032-ATL_dmm12x1x12_mips.c --]
[-- Type: application/octet-stream, Size: 17299 bytes --]

/*
#include "atlas_misc.h"
 */

#define LF_DGEMM_BLOCK      12
#define LF_DGEMM_PREFETCH   1

#define       MIPS
#if   defined(MIPS)
  #define LF_LDC1(fd, p, __vars) \
    __asm__ volatile ("ldc1\t%0,%2(%1)" : "=f"(fd) : "r"(p), "n"((__vars)*sizeof(TYPE)))
  #define LF_SDC1(fs, p, __vars) \
    __asm__ volatile ("sdc1\t%0,%2(%1)" : : "f"(fs), "r"(p), "n"((__vars)*sizeof(TYPE)))
  #define LF_MUL(fd, fs) \
    __asm__ volatile ("mul.d\t%0,%0,%1" : "+f"(fd) : "f"(fs))
  #define LF_MADD(fd, fs1, fs2) \
    __asm__ volatile ("madd.d\t%0,%0,%1,%2" : "+f"(fd) : "f"(fs1), "f"(fs2))
  #if (LF_DGEMM_PREFETCH != 0)
    #define LF_DGEMM_PREF_LOAD(address,vars) \
      __asm__ volatile ("pref 0, %1(%0)" : : "r"(address), "n"((vars)*sizeof(TYPE)))
  #else  /* LF_DGEMM_PREFETCH == 0 */
    #define LF_DGEMM_PREF_LOAD(address,vars)
  #endif
  #if 0
    #define LF_ASM(x)
  #else
    /* Used only for annotating assembly listings */
    #define LF_ASM(x) __asm__(x)
  #endif
#else
  #define LF_LDC1(fd, p, __vars) fd = (p)[__vars]
  #define LF_SDC1(fs, p, __vars) (p)[__vars] = fs
  #define LF_MUL(fd, fs) fd *= fs
  #define LF_MADD(fd, f1, f2) fd += f1 * f2
  #if (LF_DGEMM_PREFETCH != 0)
    #include "atlas_prefetch.h"
    #define LF_DGEMM_PREF_LOAD(address,vars) ATL_pfl1R(&address[vars])
  #else  /* LF_DGEMM_PREFETCH == 0 */
    #define LF_DGEMM_PREF_LOAD(address,vars)
  #endif /* LF_DGEMM_PREFETCH == 0 */
  #define LF_ASM(x)
#endif


#define PREFETCH_ROW(p,vars) \
{ \
    LF_ASM(".globl PREFETCH_ROW"); \
    LF_DGEMM_PREF_LOAD(p, (vars)+0); \
    LF_DGEMM_PREF_LOAD(p, (vars)+4); \
    LF_DGEMM_PREF_LOAD(p, (vars)+8); \
}

#define PREFETCH_COL(p,vars) \
{ \
    LF_ASM(".globl PREFETCH_COL"); \
    LF_DGEMM_PREF_LOAD(p, 0*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(p, 1*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(p, 2*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(p, 3*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(p, 4*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(p, 5*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(p, 6*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(p, 7*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(p, 8*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(p, 9*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(p, 10*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(p, 11*KB+(vars)); \
}

#define STORE_C_BETA0(vars) \
{ \
    LF_ASM(".globl STORE_C_BETA0"); \
    LF_MUL(c00, alpha); \
    LF_MUL(c01, alpha); \
    LF_MUL(c02, alpha); \
    LF_MUL(c03, alpha); \
    LF_MUL(c04, alpha); \
    LF_MUL(c05, alpha); \
    LF_MUL(c06, alpha); \
    LF_MUL(c07, alpha); \
    LF_MUL(c08, alpha); \
    LF_MUL(c09, alpha); \
    LF_MUL(c10, alpha); \
    LF_MUL(c11, alpha); \
    PREFETCH_ROW(pC, LF_DGEMM_PREFETCH * LF_DGEMM_BLOCK); \
    LF_SDC1(c00, pC, (vars)+0); \
    LF_SDC1(c01, pC, (vars)+1); \
    LF_SDC1(c02, pC, (vars)+2); \
    LF_SDC1(c03, pC, (vars)+3); \
    LF_SDC1(c04, pC, (vars)+4); \
    LF_SDC1(c05, pC, (vars)+5); \
    LF_SDC1(c06, pC, (vars)+6); \
    LF_SDC1(c07, pC, (vars)+7); \
    LF_SDC1(c08, pC, (vars)+8); \
    LF_SDC1(c09, pC, (vars)+9); \
    LF_SDC1(c10, pC, (vars)+10); \
    LF_SDC1(c11, pC, (vars)+11); \
}

#define STORE_C_BETA1(vars) \
{ \
    LF_ASM(".globl STORE_C_BETA1"); \
    LF_LDC1(a0, pC, (vars)+0); \
    LF_LDC1(a1, pC, (vars)+1); \
    LF_MADD(a0, c00, alpha); \
    LF_MADD(a1, c01, alpha); \
    LF_LDC1(c00, pC, (vars)+2); \
    LF_LDC1(c01, pC, (vars)+3); \
    LF_MADD(c00, c02, alpha); \
    LF_MADD(c01, c03, alpha); \
    LF_LDC1(c02, pC, (vars)+4); \
    LF_LDC1(c03, pC, (vars)+5); \
    LF_MADD(c02, c04, alpha); \
    LF_MADD(c03, c05, alpha); \
    LF_LDC1(c04, pC, (vars)+6); \
    LF_LDC1(c05, pC, (vars)+7); \
    LF_MADD(c04, c06, alpha); \
    LF_MADD(c05, c07, alpha); \
    LF_LDC1(c06, pC, (vars)+8); \
    LF_LDC1(c07, pC, (vars)+9); \
    LF_MADD(c06, c08, alpha); \
    LF_MADD(c07, c09, alpha); \
    LF_LDC1(c08, pC, (vars)+10); \
    LF_LDC1(c09, pC, (vars)+11); \
    LF_MADD(c08, c10, alpha); \
    LF_MADD(c09, c11, alpha); \
    PREFETCH_ROW(pC, LF_DGEMM_PREFETCH * LF_DGEMM_BLOCK); \
    LF_SDC1(a0, pC, (vars)+0); \
    LF_SDC1(a1, pC, (vars)+1); \
    LF_SDC1(c00, pC, (vars)+2); \
    LF_SDC1(c01, pC, (vars)+3); \
    LF_SDC1(c02, pC, (vars)+4); \
    LF_SDC1(c03, pC, (vars)+5); \
    LF_SDC1(c04, pC, (vars)+6); \
    LF_SDC1(c05, pC, (vars)+7); \
    LF_SDC1(c06, pC, (vars)+8); \
    LF_SDC1(c07, pC, (vars)+9); \
    LF_SDC1(c08, pC, (vars)+10); \
    LF_SDC1(c09, pC, (vars)+11); \
}

#define STORE_C_BETAX(vars) \
{ \
    LF_ASM(".globl STORE_C_BETAX"); \
    LF_LDC1(a0, pC, (vars)+0); \
    LF_LDC1(a1, pC, (vars)+1); \
    LF_MUL(a0, beta); \
    LF_MUL(a1, beta); \
    LF_MADD(a0, c00, alpha); \
    LF_MADD(a1, c01, alpha); \
    LF_LDC1(c00, pC, (vars)+2); \
    LF_LDC1(c01, pC, (vars)+3); \
    LF_MUL(c00, beta); \
    LF_MUL(c01, beta); \
    LF_MADD(c00, c02, alpha); \
    LF_MADD(c01, c03, alpha); \
    LF_LDC1(c02, pC, (vars)+4); \
    LF_LDC1(c03, pC, (vars)+5); \
    LF_MUL(c02, beta); \
    LF_MUL(c03, beta); \
    LF_MADD(c02, c04, alpha); \
    LF_MADD(c03, c05, alpha); \
    LF_LDC1(c04, pC, (vars)+6); \
    LF_LDC1(c05, pC, (vars)+7); \
    LF_MUL(c04, beta); \
    LF_MUL(c05, beta); \
    LF_MADD(c04, c06, alpha); \
    LF_MADD(c05, c07, alpha); \
    LF_LDC1(c06, pC, (vars)+8); \
    LF_LDC1(c07, pC, (vars)+9); \
    LF_MUL(c06, beta); \
    LF_MUL(c07, beta); \
    LF_MADD(c06, c08, alpha); \
    LF_MADD(c07, c09, alpha); \
    LF_LDC1(c08, pC, (vars)+10); \
    LF_LDC1(c09, pC, (vars)+11); \
    LF_MUL(c08, beta); \
    LF_MUL(c09, beta); \
    LF_MADD(c08, c10, alpha); \
    LF_MADD(c09, c11, alpha); \
    PREFETCH_ROW(pC, LF_DGEMM_PREFETCH * LF_DGEMM_BLOCK); \
    LF_SDC1(a0, pC, (vars)+0); \
    LF_SDC1(a1, pC, (vars)+1); \
    LF_SDC1(c00, pC, (vars)+2); \
    LF_SDC1(c01, pC, (vars)+3); \
    LF_SDC1(c02, pC, (vars)+4); \
    LF_SDC1(c03, pC, (vars)+5); \
    LF_SDC1(c04, pC, (vars)+6); \
    LF_SDC1(c05, pC, (vars)+7); \
    LF_SDC1(c06, pC, (vars)+8); \
    LF_SDC1(c07, pC, (vars)+9); \
    LF_SDC1(c08, pC, (vars)+10); \
    LF_SDC1(c09, pC, (vars)+11); \
}

/* Load row of B and prefetch next row of B */
#define LOAD_B() \
{ \
    LF_ASM(".globl LOAD_B"); \
    LF_LDC1(b00, pB, 0); \
    LF_LDC1(b01, pB, 1); \
    LF_LDC1(b02, pB, 2); \
    LF_LDC1(b03, pB, 3); \
    LF_LDC1(b04, pB, 4); \
    LF_LDC1(b05, pB, 5); \
    LF_LDC1(b06, pB, 6); \
    LF_LDC1(b07, pB, 7); \
    LF_LDC1(b08, pB, 8); \
    LF_LDC1(b09, pB, 9); \
    LF_LDC1(b10, pB, 10); \
    LF_LDC1(b11, pB, 11); \
    PREFETCH_ROW(pB, KB); \
    pB += KB; \
}

/* Load column of A and multiply by bx */
#define MULT_C(bx, vars) \
{ \
    LF_ASM(".globl MULT_C"); \
    LF_LDC1(c00, pA, 0*KB+(vars)); \
    LF_LDC1(c01, pA, 1*KB+(vars)); \
    LF_MUL(c00, bx); \
    LF_MUL(c01, bx); \
    LF_LDC1(c02, pA, 2*KB+(vars)); \
    LF_LDC1(c03, pA, 3*KB+(vars)); \
    LF_MUL(c02, bx); \
    LF_MUL(c03, bx); \
    LF_LDC1(c04, pA, 4*KB+(vars)); \
    LF_LDC1(c05, pA, 5*KB+(vars)); \
    LF_MUL(c04, bx); \
    LF_MUL(c05, bx); \
    LF_LDC1(c06, pA, 6*KB+(vars)); \
    LF_LDC1(c07, pA, 7*KB+(vars)); \
    LF_MUL(c06, bx); \
    LF_MUL(c07, bx); \
    LF_LDC1(c08, pA, 8*KB+(vars)); \
    LF_LDC1(c09, pA, 9*KB+(vars)); \
    LF_MUL(c08, bx); \
    LF_MUL(c09, bx); \
    LF_LDC1(c10, pA, 10*KB+(vars)); \
    LF_LDC1(c11, pA, 11*KB+(vars)); \
    LF_MUL(c10, bx); \
    LF_MUL(c11, bx); \
}

/* Load column of A, multiply by bx and add to row of C */
#define MADD_A(bx, vars) \
{ \
    LF_ASM(".globl MADD_A"); \
    LF_LDC1(a0, pA, 0*KB+(vars)); \
    LF_LDC1(a1, pA, 1*KB+(vars)); \
    LF_MADD(c00, a0, bx); \
    LF_MADD(c01, a1, bx); \
    LF_LDC1(a0, pA, 2*KB+(vars)); \
    LF_LDC1(a1, pA, 3*KB+(vars)); \
    LF_MADD(c02, a0, bx); \
    LF_MADD(c03, a1, bx); \
    LF_LDC1(a0, pA, 4*KB+(vars)); \
    LF_LDC1(a1, pA, 5*KB+(vars)); \
    LF_MADD(c04, a0, bx); \
    LF_MADD(c05, a1, bx); \
    LF_LDC1(a0, pA, 6*KB+(vars)); \
    LF_LDC1(a1, pA, 7*KB+(vars)); \
    LF_MADD(c06, a0, bx); \
    LF_MADD(c07, a1, bx); \
    LF_LDC1(a0, pA, 8*KB+(vars)); \
    LF_LDC1(a1, pA, 9*KB+(vars)); \
    LF_MADD(c08, a0, bx); \
    LF_MADD(c09, a1, bx); \
    LF_LDC1(a0, pA, 10*KB+(vars)); \
    LF_LDC1(a1, pA, 11*KB+(vars)); \
    LF_MADD(c10, a0, bx); \
    LF_MADD(c11, a1, bx); \
}

/* Same as MADD_A but with prefetch of next column of A */
#define PMAD_A(bx, vars) \
{ \
    LF_ASM(".globl PMAD_A"); \
    LF_LDC1(a0, pA, 0*KB+(vars)); \
    LF_LDC1(a1, pA, 1*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(pA, (LF_DGEMM_PREFETCH*LF_DGEMM_BLOCK+0)*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(pA, (LF_DGEMM_PREFETCH*LF_DGEMM_BLOCK+1)*KB+(vars)); \
    LF_MADD(c00, a0, bx); \
    LF_MADD(c01, a1, bx); \
    LF_LDC1(a0, pA, 2*KB+(vars)); \
    LF_LDC1(a1, pA, 3*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(pA, (LF_DGEMM_PREFETCH*LF_DGEMM_BLOCK+2)*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(pA, (LF_DGEMM_PREFETCH*LF_DGEMM_BLOCK+3)*KB+(vars)); \
    LF_MADD(c02, a0, bx); \
    LF_MADD(c03, a1, bx); \
    LF_LDC1(a0, pA, 4*KB+(vars)); \
    LF_LDC1(a1, pA, 5*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(pA, (LF_DGEMM_PREFETCH*LF_DGEMM_BLOCK+4)*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(pA, (LF_DGEMM_PREFETCH*LF_DGEMM_BLOCK+5)*KB+(vars)); \
    LF_MADD(c04, a0, bx); \
    LF_MADD(c05, a1, bx); \
    LF_LDC1(a0, pA, 6*KB+(vars)); \
    LF_LDC1(a1, pA, 7*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(pA, (LF_DGEMM_PREFETCH*LF_DGEMM_BLOCK+6)*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(pA, (LF_DGEMM_PREFETCH*LF_DGEMM_BLOCK+7)*KB+(vars)); \
    LF_MADD(c06, a0, bx); \
    LF_MADD(c07, a1, bx); \
    LF_LDC1(a0, pA, 8*KB+(vars)); \
    LF_LDC1(a1, pA, 9*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(pA, (LF_DGEMM_PREFETCH*LF_DGEMM_BLOCK+8)*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(pA, (LF_DGEMM_PREFETCH*LF_DGEMM_BLOCK+9)*KB+(vars)); \
    LF_MADD(c08, a0, bx); \
    LF_MADD(c09, a1, bx); \
    LF_LDC1(a0, pA, 10*KB+(vars)); \
    LF_LDC1(a1, pA, 11*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(pA, (LF_DGEMM_PREFETCH*LF_DGEMM_BLOCK+10)*KB+(vars)); \
    LF_DGEMM_PREF_LOAD(pA, (LF_DGEMM_PREFETCH*LF_DGEMM_BLOCK+11)*KB+(vars)); \
    LF_MADD(c10, a0, bx); \
    LF_MADD(c11, a1, bx); \
}



/*
 *  NOTES
 *      These functions assume the data set fits in L1 cache,
 *      therefore each variable need only be prefetched once.
 *
 *  OUTPUT
 *      C += alpha * A(T) * B
 */

#define pB  B
#define pC  C

/*  1 FP registers */
/*  4 GP registers */
#if   defined(BETA0)
static void first_beta0(const TYPE alpha, const TYPE *A, const TYPE *B, TYPE *C, const int ldc_M, const TYPE *pEndB)
{
    /* 26 FP registers */
    register TYPE           a0,a1;
    register TYPE           b00,b01,b02,b03,b04,b05,b06,b07,b08,b09,b10,b11;
    register TYPE           c00,c01,c02,c03,c04,c05,c06,c07,c08,c09,c10,c11;

    /*  3 GP registers */
    register const TYPE *   pA;
    register const TYPE *   pEndC;

    /* Prefetch first row of B */
    PREFETCH_ROW(pB, 0);

    /* Prefetch first block of A */
    pA = A;
    PREFETCH_COL(pA,  0);
    PREFETCH_COL(pA,  4);
    PREFETCH_COL(pA,  8);

    /* Prefetch first row of C */
    PREFETCH_ROW(pC, 0);

    /* for j = 0 */
    {
        pEndC = &pC[MB];
        LOAD_B();
        while (pC < pEndC) /* for i = 0..M step 4 */
        {
            MULT_C(b00,  0);
            MADD_A(b01,  1);
            MADD_A(b02,  2);
            PMAD_A(b03,  3);
            MADD_A(b04,  4);
            MADD_A(b05,  5);
            MADD_A(b06,  6);
            PMAD_A(b07,  7);
            MADD_A(b08,  8);
            MADD_A(b09,  9);
            MADD_A(b10, 10);
            PMAD_A(b11, 11);
            pA += LF_DGEMM_BLOCK * KB;
            STORE_C_BETA0(0);
            pC += LF_DGEMM_BLOCK;
        } /* for i */
        pC += ldc_M; /* ldc - M */
    }

    while (pB < pEndB) /* for j = 1..N */
    {
        pEndC = &pC[MB];
        pA = A;
        LOAD_B()
        while (pC < pEndC) /* for i = 0..M step 4 */
        {
            MULT_C(b00,  0);
            MADD_A(b01,  1);
            MADD_A(b02,  2);
            MADD_A(b03,  3);
            MADD_A(b04,  4);
            MADD_A(b05,  5);
            MADD_A(b06,  6);
            MADD_A(b07,  7);
            MADD_A(b08,  8);
            MADD_A(b09,  9);
            MADD_A(b10, 10);
            MADD_A(b11, 11);
            pA += LF_DGEMM_BLOCK * KB;
            STORE_C_BETA0(0);
            pC += LF_DGEMM_BLOCK;
        } /* for i */
        pC += ldc_M; /* ldc - M */
    } /* for j */
} /* first_beta0() */
#endif /* BETA0 */

/*  2 FP registers */
/*  4 GP registers */
#if   defined(BETAX)
static void first_betax(const TYPE alpha, const TYPE beta, const TYPE *A, const TYPE *B, TYPE *C, const int ldc_M, const TYPE *pEndB)
{
    /* 26 FP registers */
    register TYPE           a0,a1;
    register TYPE           b00,b01,b02,b03,b04,b05,b06,b07,b08,b09,b10,b11;
    register TYPE           c00,c01,c02,c03,c04,c05,c06,c07,c08,c09,c10,c11;

    /*  3 GP registers */
    register const TYPE *   pA;
    register const TYPE *   pEndC;

    /* Prefetch first row of B */
    PREFETCH_ROW(pB, 0);

    /* Prefetch first block of A */
    pA = A;
    PREFETCH_COL(pA,  0);
    PREFETCH_COL(pA,  4);
    PREFETCH_COL(pA,  8);

    /* Prefetch first row of C */
    PREFETCH_ROW(pC, 0);

    /* for j = 0 */
    {
        pEndC = &pC[MB];
        LOAD_B();
        while (pC < pEndC) /* for i = 0..M step 4 */
        {
            MULT_C(b00,  0);
            MADD_A(b01,  1);
            MADD_A(b02,  2);
            PMAD_A(b03,  3);
            MADD_A(b04,  4);
            MADD_A(b05,  5);
            MADD_A(b06,  6);
            PMAD_A(b07,  7);
            MADD_A(b08,  8);
            MADD_A(b09,  9);
            MADD_A(b10, 10);
            PMAD_A(b11, 11);
            pA += LF_DGEMM_BLOCK * KB;
            STORE_C_BETAX(0);
            pC += LF_DGEMM_BLOCK;
        } /* for i */
        pC += ldc_M; /* ldc - M */
    }

    while (pB < pEndB) /* for j = 1..N */
    {
        pEndC = &pC[MB];
        pA = A;
        LOAD_B()
        while (pC < pEndC) /* for i = 0..M step 4 */
        {
            MULT_C(b00,  0);
            MADD_A(b01,  1);
            MADD_A(b02,  2);
            MADD_A(b03,  3);
            MADD_A(b04,  4);
            MADD_A(b05,  5);
            MADD_A(b06,  6);
            MADD_A(b07,  7);
            MADD_A(b08,  8);
            MADD_A(b09,  9);
            MADD_A(b10, 10);
            MADD_A(b11, 11);
            pA += LF_DGEMM_BLOCK * KB;
            STORE_C_BETAX(0);
            pC += LF_DGEMM_BLOCK;
        } /* for i */
        pC += ldc_M; /* ldc - M */
    } /* for j */
} /* first_betax() */
#endif /* BETAX */

/*  1 FP registers */
/*  4 GP registers */
static void next(const TYPE alpha, const TYPE *A, const TYPE *B, TYPE *C, const int ldc_M, const TYPE *pEndB)
{
    /* 26 FP registers */
    register TYPE           a0,a1;
    register TYPE           b00,b01,b02,b03,b04,b05,b06,b07,b08,b09,b10,b11;
    register TYPE           c00,c01,c02,c03,c04,c05,c06,c07,c08,c09,c10,c11;

    /*  3 GP registers */
    register const TYPE *   pA;
    register const TYPE *   pEndC;

    /* Prefetch first row of B */
    PREFETCH_ROW(pB, 0);

    /* Prefetch first block of A */
    pA = A;
    PREFETCH_COL(pA,  0);
    PREFETCH_COL(pA,  4);
    PREFETCH_COL(pA,  8);

    /* Prefetch first row of C */
    PREFETCH_ROW(pC, 0);

    /* for j = 0 */
    {
        pEndC = &pC[MB];
        LOAD_B();
        while (pC < pEndC) /* for i = 0..M step 4 */
        {
            MULT_C(b00,  0);
            MADD_A(b01,  1);
            MADD_A(b02,  2);
            PMAD_A(b03,  3);
            MADD_A(b04,  4);
            MADD_A(b05,  5);
            MADD_A(b06,  6);
            PMAD_A(b07,  7);
            MADD_A(b08,  8);
            MADD_A(b09,  9);
            MADD_A(b10, 10);
            PMAD_A(b11, 11);
            pA += LF_DGEMM_BLOCK * KB;
            STORE_C_BETA1(0);
            pC += LF_DGEMM_BLOCK;
        } /* for i */
        pC += ldc_M; /* ldc - M */
    }

    while (pB < pEndB) /* for j = 1..N */
    {
        pEndC = &pC[MB];
        pA = A;
        LOAD_B()
        while (pC < pEndC) /* for i = 0..M step 4 */
        {
            MULT_C(b00,  0);
            MADD_A(b01,  1);
            MADD_A(b02,  2);
            MADD_A(b03,  3);
            MADD_A(b04,  4);
            MADD_A(b05,  5);
            MADD_A(b06,  6);
            MADD_A(b07,  7);
            MADD_A(b08,  8);
            MADD_A(b09,  9);
            MADD_A(b10, 10);
            MADD_A(b11, 11);
            pA += LF_DGEMM_BLOCK * KB;
            STORE_C_BETA1(0);
            pC += LF_DGEMM_BLOCK;
        } /* for i */
        pC += ldc_M; /* ldc - M */
    } /* for j */
} /* next() */



void ATL_USERMM(const int M, const int N, const int K,
                const TYPE alpha,
                const TYPE *A, const int lda,
                const TYPE *B, const int ldb,
                const TYPE beta,
                      TYPE *C, const int ldc)
{
    register int ldc_M = ldc - M;
    register const TYPE *pEndB = &B[KB * N];
    register int k;

#if   defined(BETA0)
    first_beta0(alpha, A, B, C, ldc_M, pEndB);
#elif defined(BETA1)
    next(alpha, A, B, C, ldc_M, pEndB);
#else
    first_betax(alpha, beta, A, B, C, ldc_M, pEndB);
#endif

    for (k = LF_DGEMM_BLOCK; k < K; k += LF_DGEMM_BLOCK)
    {
        next(alpha, &A[k], &B[k], C, ldc_M, pEndB);
    }
}


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2006-09-07 18:29 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-09-06 21:41 Resetting a Broadcom in software Jonathan Day
2006-09-06 22:32 ` Ralf Baechle
2006-09-06 22:59   ` Jonathan Day
2006-09-07  1:03     ` Ralf Baechle
2006-09-07 18:28       ` Jonathan Day

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.