* [PATCH 1/2] nv50/ir: add fp64 support on G200 (NVA0)
@ 2015-02-23 4:01 Ilia Mirkin
[not found] ` <1424664088-14913-1-git-send-email-imirkin-FrUbXkNCsVf2fBVCVOL8/A@public.gmane.org>
0 siblings, 1 reply; 5+ messages in thread
From: Ilia Mirkin @ 2015-02-23 4:01 UTC (permalink / raw)
To: mesa-dev-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
nouveau-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
Untested beyond compiling a few shaders to see if they look like they
might work. nvdisasm agrees with envydis's decoding of these things.
Will definitely get ahold of a G200 to run tests on before pushing this.
.../drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp | 94 ++++++++++++++++++---
.../nouveau/codegen/nv50_ir_lowering_nv50.cpp | 97 +++++++++++++++++++++-
.../nouveau/codegen/nv50_ir_target_nv50.cpp | 2 +-
src/gallium/drivers/nouveau/nv50/nv50_screen.c | 4 +
4 files changed, 185 insertions(+), 12 deletions(-)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
index b1e7409..7c6f7da 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
@@ -96,9 +96,12 @@ private:
void emitUADD(const Instruction *);
void emitAADD(const Instruction *);
void emitFADD(const Instruction *);
+ void emitDADD(const Instruction *);
void emitIMUL(const Instruction *);
void emitFMUL(const Instruction *);
+ void emitDMUL(const Instruction *);
void emitFMAD(const Instruction *);
+ void emitDMAD(const Instruction *);
void emitIMAD(const Instruction *);
void emitISAD(const Instruction *);
@@ -923,11 +926,13 @@ CodeEmitterNV50::emitMINMAX(const Instruction *i)
assert(0);
break;
}
- code[1] |= i->src(0).mod.abs() << 20;
- code[1] |= i->src(0).mod.neg() << 26;
- code[1] |= i->src(1).mod.abs() << 19;
- code[1] |= i->src(1).mod.neg() << 27;
}
+
+ code[1] |= i->src(0).mod.abs() << 20;
+ code[1] |= i->src(0).mod.neg() << 26;
+ code[1] |= i->src(1).mod.abs() << 19;
+ code[1] |= i->src(1).mod.neg() << 27;
+
emitForm_MAD(i);
}
@@ -963,6 +968,26 @@ CodeEmitterNV50::emitFMAD(const Instruction *i)
}
void
+CodeEmitterNV50::emitDMAD(const Instruction *i)
+{
+ const int neg_mul = i->src(0).mod.neg() ^ i->src(1).mod.neg();
+ const int neg_add = i->src(2).mod.neg();
+
+ assert(i->encSize == 8);
+ assert(!i->saturate);
+
+ code[1] = 0x40000000;
+ code[0] = 0xe0000000;
+
+ code[1] |= neg_mul << 26;
+ code[1] |= neg_add << 27;
+
+ roundMode_MAD(i);
+
+ emitForm_MAD(i);
+}
+
+void
CodeEmitterNV50::emitFADD(const Instruction *i)
{
const int neg0 = i->src(0).mod.neg();
@@ -997,6 +1022,25 @@ CodeEmitterNV50::emitFADD(const Instruction *i)
}
void
+CodeEmitterNV50::emitDADD(const Instruction *i)
+{
+ const int neg0 = i->src(0).mod.neg();
+ const int neg1 = i->src(1).mod.neg() ^ ((i->op == OP_SUB) ? 1 : 0);
+
+ assert(!(i->src(0).mod | i->src(1).mod).abs());
+ assert(!i->saturate);
+ assert(i->encSize == 8);
+
+ code[1] = 0x60000000;
+ code[0] = 0xe0000000;
+
+ emitForm_ADD(i);
+
+ code[1] |= neg0 << 26;
+ code[1] |= neg1 << 27;
+}
+
+void
CodeEmitterNV50::emitUADD(const Instruction *i)
{
const int neg0 = i->src(0).mod.neg();
@@ -1090,6 +1134,25 @@ CodeEmitterNV50::emitFMUL(const Instruction *i)
}
void
+CodeEmitterNV50::emitDMUL(const Instruction *i)
+{
+ const int neg = (i->src(0).mod ^ i->src(1).mod).neg();
+
+ assert(!i->saturate);
+ assert(i->encSize == 8);
+
+ code[1] = 0x80000000;
+ code[0] = 0xe0000000;
+
+ if (neg)
+ code[1] |= 0x08000000;
+
+ roundMode_CVT(i->rnd);
+
+ emitForm_MAD(i);
+}
+
+void
CodeEmitterNV50::emitIMAD(const Instruction *i)
{
code[0] = 0x60000000;
@@ -1150,9 +1213,11 @@ CodeEmitterNV50::emitSET(const Instruction *i)
code[0] = 0x30000000;
code[1] = 0x60000000;
- emitCondCode(i->asCmp()->setCond, i->sType, 32 + 14);
-
switch (i->sType) {
+ case TYPE_F64:
+ code[0] = 0xe0000000;
+ code[1] = 0xe0000000;
+ break;
case TYPE_F32: code[0] |= 0x80000000; break;
case TYPE_S32: code[1] |= 0x0c000000; break;
case TYPE_U32: code[1] |= 0x04000000; break;
@@ -1162,6 +1227,9 @@ CodeEmitterNV50::emitSET(const Instruction *i)
assert(0);
break;
}
+
+ emitCondCode(i->asCmp()->setCond, i->sType, 32 + 14);
+
if (i->src(0).mod.neg()) code[1] |= 0x04000000;
if (i->src(1).mod.neg()) code[1] |= 0x08000000;
if (i->src(0).mod.abs()) code[1] |= 0x00100000;
@@ -1725,7 +1793,9 @@ CodeEmitterNV50::emitInstruction(Instruction *insn)
break;
case OP_ADD:
case OP_SUB:
- if (isFloatType(insn->dType))
+ if (insn->dType == TYPE_F64)
+ emitDADD(insn);
+ else if (isFloatType(insn->dType))
emitFADD(insn);
else if (insn->getDef(0)->reg.file == FILE_ADDRESS)
emitAADD(insn);
@@ -1733,14 +1803,18 @@ CodeEmitterNV50::emitInstruction(Instruction *insn)
emitUADD(insn);
break;
case OP_MUL:
- if (isFloatType(insn->dType))
+ if (insn->dType == TYPE_F64)
+ emitDMUL(insn);
+ else if (isFloatType(insn->dType))
emitFMUL(insn);
else
emitIMUL(insn);
break;
case OP_MAD:
case OP_FMA:
- if (isFloatType(insn->dType))
+ if (insn->dType == TYPE_F64)
+ emitDMAD(insn);
+ else if (isFloatType(insn->dType))
emitFMAD(insn);
else
emitIMAD(insn);
@@ -1912,7 +1986,7 @@ CodeEmitterNV50::getMinEncodingSize(const Instruction *i) const
{
const Target::OpInfo &info = targ->getOpInfo(i);
- if (info.minEncSize > 4)
+ if (info.minEncSize > 4 || i->dType == TYPE_F64)
return 8;
// check constraints on dst and src operands
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
index 1ad0860..d5dadc2 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -314,6 +314,7 @@ private:
void handleDIV(Instruction *);
void handleMOD(Instruction *);
void handleMUL(Instruction *);
+ void handleDRCPRSQ(Instruction *);
void handleAddrDef(Instruction *);
inline bool isARL(const Instruction *) const;
@@ -552,6 +553,95 @@ NV50LegalizeSSA::handleMOD(Instruction *mod)
mod->setSrc(1, m);
}
+void
+NV50LegalizeSSA::handleDRCPRSQ(Instruction *i)
+{
+ /* We need to replace this instruction with a sequence that computes the
+ * appropriate function. As a first guess, we use the "quake" style
+ * approximation for RSQ:
+ *
+ * 0x5fe6eb50c7b537a9 - num >> 1
+ *
+ * For RCP, we will then square it.
+ */
+ Value *abs, *guess, *parts[2], *input[2], *shr[4], *pred;
+
+ bld.setPosition(i, false);
+
+ abs = bld.mkOp1v(OP_ABS, TYPE_F64, bld.getSSA(8), i->getSrc(0));
+
+ parts[0] = bld.loadImm(NULL, 0xc7b537a9);
+ parts[1] = bld.loadImm(NULL, 0x5fe6eb50);
+ guess = bld.mkOp2v(OP_MERGE, TYPE_F64, bld.getSSA(8), parts[0], parts[1]);
+
+ bld.mkSplit(input, 4, abs);
+ shr[0] = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(4), input[0], bld.mkImm(1));
+ shr[1] = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(4), input[1], bld.mkImm(1));
+
+ // If the bottom bit of the high word was set, set the high bit of the
+ // bottom word.
+ pred = bld.getSSA(1, FILE_FLAGS);
+ bld.mkOp2(OP_AND, TYPE_U32, NULL, input[1], bld.loadImm(NULL, 1))
+ ->setFlagsDef(0, pred);
+ shr[2] = bld.getSSA(4); shr[3] = bld.getSSA(4);
+ bld.mkOp2(OP_OR, TYPE_U32, shr[2], shr[0], bld.loadImm(NULL, 0x80000000))
+ ->setPredicate(CC_S, pred);
+ bld.mkMov(shr[3], shr[0])
+ ->setPredicate(CC_NS, pred);
+ shr[0] = bld.mkOp2v(OP_UNION, TYPE_U32, bld.getSSA(4), shr[2], shr[3]);
+
+ guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8), guess,
+ bld.mkOp2v(OP_MERGE, TYPE_F64, bld.getSSA(8), shr[0], shr[1]));
+
+ if (i->op == OP_RCP) {
+ Value *two = bld.getSSA(8), *neg = bld.getSSA(8), *copy = bld.getSSA(8);
+
+ bld.mkCvt(OP_CVT, TYPE_F64, two, TYPE_F32, bld.loadImm(NULL, 2.0f));
+
+ /* Square the guess first, since it was for RSQ */
+ guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess);
+
+ // RCP: x_{n+1} = 2 * x_n - input * x_n^2
+ guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8),
+ bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess),
+ bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), abs,
+ bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess)));
+ guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8),
+ bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess),
+ bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), abs,
+ bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess)));
+
+ // Restore the sign on the output
+ bld.mkSplit(input, 4, i->getSrc(0));
+ bld.mkOp2(OP_AND, TYPE_U32, NULL, input[1], bld.loadImm(NULL, 0x80000000))
+ ->setFlagsDef(0, (pred = bld.getSSA(1, FILE_FLAGS)));
+ bld.mkOp1(OP_NEG, TYPE_F64, neg, guess)
+ ->setPredicate(CC_S, pred);
+ bld.mkMov(copy, guess)
+ ->setPredicate(CC_NS, pred);
+ guess = bld.mkOp2v(OP_UNION, TYPE_U64, bld.getSSA(8), neg, copy);
+ } else {
+ Value *half_input = bld.getSSA(8), *three_half = bld.getSSA(8);
+ bld.mkCvt(OP_CVT, TYPE_F64, half_input, TYPE_F32, bld.loadImm(NULL, -0.5f));
+ bld.mkCvt(OP_CVT, TYPE_F64, three_half, TYPE_F32, bld.loadImm(NULL, 1.5f));
+
+ half_input = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), half_input, abs);
+ // RSQ: x_{n+1} = x_n * (1.5 - 0.5 * input * x_n^2)
+ guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess,
+ bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input,
+ bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess),
+ three_half));
+ guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess,
+ bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input,
+ bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess),
+ three_half));
+ }
+
+ i->op = OP_MOV;
+ i->setSrc(0, guess);
+}
+
+
bool
NV50LegalizeSSA::visit(BasicBlock *bb)
{
@@ -578,6 +668,11 @@ NV50LegalizeSSA::visit(BasicBlock *bb)
case OP_MUL:
handleMUL(insn);
break;
+ case OP_RCP:
+ case OP_RSQ:
+ if (insn->dType == TYPE_F64)
+ handleDRCPRSQ(insn);
+ break;
default:
break;
}
@@ -1162,7 +1257,7 @@ NV50LoweringPreSSA::handleDIV(Instruction *i)
bool
NV50LoweringPreSSA::handleSQRT(Instruction *i)
{
- Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
+ Instruction *rsq = bld.mkOp1(OP_RSQ, i->dType,
bld.getSSA(), i->getSrc(0));
i->op = OP_MUL;
i->setSrc(1, rsq->getDef(0));
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
index 178a167..f3d8733 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
@@ -388,7 +388,7 @@ TargetNV50::isAccessSupported(DataFile file, DataType ty) const
bool
TargetNV50::isOpSupported(operation op, DataType ty) const
{
- if (ty == TYPE_F64 && chipset < 0xa0)
+ if (ty == TYPE_F64 && chipset != 0xa0)
return false;
switch (op) {
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index ed07ba4..4532957 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -237,6 +237,8 @@ static int
nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
enum pipe_shader_cap param)
{
+ struct nouveau_device *dev = nouveau_screen(pscreen)->device;
+
switch (shader) {
case PIPE_SHADER_VERTEX:
case PIPE_SHADER_GEOMETRY:
@@ -287,7 +289,9 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
return MIN2(32, PIPE_MAX_SAMPLERS);
case PIPE_SHADER_CAP_DOUBLES:
+ return dev->chipset == 0xa0;
case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
+ return dev->chipset == 0xa0;
case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
return 0;
default:
--
2.0.5
_______________________________________________
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH 2/2] nvc0/ir: improve precision of double RCP/RSQ results
[not found] ` <1424664088-14913-1-git-send-email-imirkin-FrUbXkNCsVf2fBVCVOL8/A@public.gmane.org>
@ 2015-02-23 4:01 ` Ilia Mirkin
2015-02-23 13:24 ` Roland Scheidegger
0 siblings, 1 reply; 5+ messages in thread
From: Ilia Mirkin @ 2015-02-23 4:01 UTC (permalink / raw)
To: mesa-dev-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW,
nouveau-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
Not sure how many steps are needed for the necessary accuracy. Just
doing 2 because that seems like a reasonable number.
.../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 42 ++++++++++++++++++++--
1 file changed, 39 insertions(+), 3 deletions(-)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 87e75e1..9767566 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -77,8 +77,9 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
bld.setPosition(i, false);
// 1. Take the source and it up.
- Value *src[2], *dst[2], *def = i->getDef(0);
- bld.mkSplit(src, 4, i->getSrc(0));
+ Value *input = i->getSrc(0);
+ Value *src[2], *dst[2], *guess, *def = i->getDef(0);
+ bld.mkSplit(src, 4, input);
// 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
dst[0] = bld.loadImm(NULL, 0);
@@ -93,7 +94,42 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
// 4. Recombine the two dst pieces back into the original destination.
bld.setPosition(i, true);
- bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]);
+ guess = bld.mkOp2v(OP_MERGE, TYPE_U64, bld.getSSA(8), dst[0], dst[1]);
+
+ // 5. Perform 2 Newton-Raphson steps
+ if (i->op == OP_RCP) {
+ // RCP: x_{n+1} = 2 * x_n - input * x_n^2
+ Value *two = bld.getSSA(8);
+
+ bld.mkCvt(OP_CVT, TYPE_F64, two, TYPE_F32, bld.loadImm(NULL, 2.0f));
+
+ guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8),
+ bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess),
+ bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input,
+ bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess)));
+ guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8),
+ bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess),
+ bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input,
+ bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess)));
+ } else {
+ // RSQ: x_{n+1} = x_n (1.5 - 0.5 * input * x_n^2)
+ Value *half_input = bld.getSSA(8), *three_half = bld.getSSA(8);
+ bld.mkCvt(OP_CVT, TYPE_F64, half_input, TYPE_F32, bld.loadImm(NULL, -0.5f));
+ bld.mkCvt(OP_CVT, TYPE_F64, three_half, TYPE_F32, bld.loadImm(NULL, 1.5f));
+
+ half_input = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), half_input, input);
+ // RSQ: x_{n+1} = x_n * (1.5 - 0.5 * input * x_n^2)
+ guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess,
+ bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input,
+ bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess),
+ three_half));
+ guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess,
+ bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input,
+ bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess),
+ three_half));
+ }
+
+ bld.mkMov(def, guess);
}
bool
--
2.0.5
_______________________________________________
Nouveau mailing list
Nouveau@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/nouveau
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH 2/2] nvc0/ir: improve precision of double RCP/RSQ results
2015-02-23 4:01 ` [PATCH 2/2] nvc0/ir: improve precision of double RCP/RSQ results Ilia Mirkin
@ 2015-02-23 13:24 ` Roland Scheidegger
2015-02-23 15:40 ` Ilia Mirkin
0 siblings, 1 reply; 5+ messages in thread
From: Roland Scheidegger @ 2015-02-23 13:24 UTC (permalink / raw)
To: Ilia Mirkin, mesa-dev, nouveau
Does this give correct results for special floats (0, infs)?
We tried to improve (for single floats) x86 rcp in llvmpipe with
newton-raphson, but unfortunately not being able to give correct results
for these two cases (without even more additional code) meant it got all
disabled in the end (you can still see that code in the driver) since
the problems are at least as bad as those due to bad accuracy...
Roland
Am 23.02.2015 um 05:01 schrieb Ilia Mirkin:
> Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
> ---
>
> Not sure how many steps are needed for the necessary accuracy. Just
> doing 2 because that seems like a reasonable number.
>
> .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 42 ++++++++++++++++++++--
> 1 file changed, 39 insertions(+), 3 deletions(-)
>
> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
> index 87e75e1..9767566 100644
> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
> @@ -77,8 +77,9 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
> bld.setPosition(i, false);
>
> // 1. Take the source and it up.
> - Value *src[2], *dst[2], *def = i->getDef(0);
> - bld.mkSplit(src, 4, i->getSrc(0));
> + Value *input = i->getSrc(0);
> + Value *src[2], *dst[2], *guess, *def = i->getDef(0);
> + bld.mkSplit(src, 4, input);
>
> // 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
> dst[0] = bld.loadImm(NULL, 0);
> @@ -93,7 +94,42 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
>
> // 4. Recombine the two dst pieces back into the original destination.
> bld.setPosition(i, true);
> - bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]);
> + guess = bld.mkOp2v(OP_MERGE, TYPE_U64, bld.getSSA(8), dst[0], dst[1]);
> +
> + // 5. Perform 2 Newton-Raphson steps
> + if (i->op == OP_RCP) {
> + // RCP: x_{n+1} = 2 * x_n - input * x_n^2
> + Value *two = bld.getSSA(8);
> +
> + bld.mkCvt(OP_CVT, TYPE_F64, two, TYPE_F32, bld.loadImm(NULL, 2.0f));
> +
> + guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8),
> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess),
> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input,
> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess)));
> + guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8),
> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess),
> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input,
> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess)));
> + } else {
> + // RSQ: x_{n+1} = x_n (1.5 - 0.5 * input * x_n^2)
> + Value *half_input = bld.getSSA(8), *three_half = bld.getSSA(8);
> + bld.mkCvt(OP_CVT, TYPE_F64, half_input, TYPE_F32, bld.loadImm(NULL, -0.5f));
> + bld.mkCvt(OP_CVT, TYPE_F64, three_half, TYPE_F32, bld.loadImm(NULL, 1.5f));
> +
> + half_input = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), half_input, input);
> + // RSQ: x_{n+1} = x_n * (1.5 - 0.5 * input * x_n^2)
> + guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess,
> + bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input,
> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess),
> + three_half));
> + guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess,
> + bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input,
> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess),
> + three_half));
> + }
> +
> + bld.mkMov(def, guess);
> }
>
> bool
>
_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH 2/2] nvc0/ir: improve precision of double RCP/RSQ results
2015-02-23 13:24 ` Roland Scheidegger
@ 2015-02-23 15:40 ` Ilia Mirkin
2015-02-23 20:23 ` Ilia Mirkin
0 siblings, 1 reply; 5+ messages in thread
From: Ilia Mirkin @ 2015-02-23 15:40 UTC (permalink / raw)
To: Roland Scheidegger
Cc: mesa-dev@lists.freedesktop.org, nouveau@lists.freedesktop.org
Oh right. I think the NVIDIA blob executes those steps conditionally
based on the upper bits not being 0x7ff (== infinity/nan). I should do
the same thing here. [FWIW I was able to test the nv50 code last night
and that one's a total fail for rcp/rsq... will need to port that over
to my nvc0 and debug there.]
On Mon, Feb 23, 2015 at 8:24 AM, Roland Scheidegger <sroland@vmware.com> wrote:
> Does this give correct results for special floats (0, infs)?
> We tried to improve (for single floats) x86 rcp in llvmpipe with
> newton-raphson, but unfortunately not being able to give correct results
> for these two cases (without even more additional code) meant it got all
> disabled in the end (you can still see that code in the driver) since
> the problems are at least as bad as those due to bad accuracy...
>
> Roland
>
> Am 23.02.2015 um 05:01 schrieb Ilia Mirkin:
>> Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
>> ---
>>
>> Not sure how many steps are needed for the necessary accuracy. Just
>> doing 2 because that seems like a reasonable number.
>>
>> .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 42 ++++++++++++++++++++--
>> 1 file changed, 39 insertions(+), 3 deletions(-)
>>
>> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
>> index 87e75e1..9767566 100644
>> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
>> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
>> @@ -77,8 +77,9 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
>> bld.setPosition(i, false);
>>
>> // 1. Take the source and it up.
>> - Value *src[2], *dst[2], *def = i->getDef(0);
>> - bld.mkSplit(src, 4, i->getSrc(0));
>> + Value *input = i->getSrc(0);
>> + Value *src[2], *dst[2], *guess, *def = i->getDef(0);
>> + bld.mkSplit(src, 4, input);
>>
>> // 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
>> dst[0] = bld.loadImm(NULL, 0);
>> @@ -93,7 +94,42 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
>>
>> // 4. Recombine the two dst pieces back into the original destination.
>> bld.setPosition(i, true);
>> - bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]);
>> + guess = bld.mkOp2v(OP_MERGE, TYPE_U64, bld.getSSA(8), dst[0], dst[1]);
>> +
>> + // 5. Perform 2 Newton-Raphson steps
>> + if (i->op == OP_RCP) {
>> + // RCP: x_{n+1} = 2 * x_n - input * x_n^2
>> + Value *two = bld.getSSA(8);
>> +
>> + bld.mkCvt(OP_CVT, TYPE_F64, two, TYPE_F32, bld.loadImm(NULL, 2.0f));
>> +
>> + guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8),
>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess),
>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input,
>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess)));
>> + guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8),
>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess),
>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input,
>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess)));
>> + } else {
>> + // RSQ: x_{n+1} = x_n (1.5 - 0.5 * input * x_n^2)
>> + Value *half_input = bld.getSSA(8), *three_half = bld.getSSA(8);
>> + bld.mkCvt(OP_CVT, TYPE_F64, half_input, TYPE_F32, bld.loadImm(NULL, -0.5f));
>> + bld.mkCvt(OP_CVT, TYPE_F64, three_half, TYPE_F32, bld.loadImm(NULL, 1.5f));
>> +
>> + half_input = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), half_input, input);
>> + // RSQ: x_{n+1} = x_n * (1.5 - 0.5 * input * x_n^2)
>> + guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess,
>> + bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input,
>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess),
>> + three_half));
>> + guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess,
>> + bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input,
>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess),
>> + three_half));
>> + }
>> +
>> + bld.mkMov(def, guess);
>> }
>>
>> bool
>>
>
_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH 2/2] nvc0/ir: improve precision of double RCP/RSQ results
2015-02-23 15:40 ` Ilia Mirkin
@ 2015-02-23 20:23 ` Ilia Mirkin
0 siblings, 0 replies; 5+ messages in thread
From: Ilia Mirkin @ 2015-02-23 20:23 UTC (permalink / raw)
To: Roland Scheidegger
Cc: mesa-dev@lists.freedesktop.org, nouveau@lists.freedesktop.org
Just to follow up, this is the code the nvidia blob emits for rsq().
The 64-bit argument is in c1[0x0] (low word) and c1[0x4] (high word). I _think_ that c3 contains:
PB: 0x7fffffff GF100_M2MF.DATA = 0x7fffffff
PB: 0x7ff00000 GF100_M2MF.DATA = 0x7ff00000
PB: 0x00000000 GF100_M2MF.DATA = 0
PB: 0x3fe00000 GF100_M2MF.DATA = 0x3fe00000
PB: 0x3ff00000 GF100_M2MF.DATA = 0x3ff00000
00000000: 10005de4 28004400 mov b32 $r1 c1[0x4]
00000008: 10109c03 68004c00 and b32 $r2 $r1 c3[0x4]
r2 = exponent bits
00000010: 00101c03 68004c00 and b32 $r0 $r1 c3[0x0]
r0 = non-sign bits (of upper word)
00000018: 1021dc03 1a8e4c00 set $p0 0x1 ne u32 $r2 c3[0x4]
p0 = exponent == 0x7ff (i.e. inf/nan)
00000020: 00001c43 68004400 or b32 $r0 $r0 c1[0x0]
r0 = all non-sign bits or'd
00000028: fc001c04 20000000 selp b32 $r0 $r0 0x0 $p0
if (exponent == 0x7ff)
r0 = all mantissa bits
else
r0 = 0
00000030: fc01dc23 190e0000 set $p0 0x1 eq s32 $r0 0x0
p0 = is input a nan (i.e. exponent = 0x7ff and mantissa bits set)
00000038: 00001de4 28004400 mov b32 $r0 c1[0x0]
00000040: 200081e7 40000001 $p0 bra allwarp 0x90
So all of the (not $p0) instructions execute for every non-NaN input, including infinities.
r0d = input
00000048: 0000a1e2 19ff0000 (not $p0) mov b32 $r2 0x7fc00000
00000050: fc0121e4 28000000 (not $p0) mov b32 $r4 0x0
00000058: 1020a103 68004400 (not $p0) and b32 $r2 $r2 not c1[0x4]
00000060: 00216002 08004000 (not $p0) add b32 $r5 $r2 0x100000
00000068: 0450e203 5800c000 (not $p0) shr u32 $r3 $r5 wrap 0x1
00000070: 10002001 50000000 (not $p0) mul rn f64 $r0d $r0d $r4d
00000078: fc00a1e4 28000000 (not $p0) mov b32 $r2 0x0
00000080: 0030e002 087fe000 (not $p0) add b32 $r3 $r3 0x1ff80000
I tried decoding this, but it's some crazy business -- futzing with
the exponent, I think it's trying to compress more into the high
32-bits so that rsqrt64h below has more precision.
00000088: 00001de4 40000000 nop
00000090: fc011de4 28000000 B mov b32 $r4 0x0
00000098: 1c115c00 c8000000 rsqrt64h $r5 $r1
000000a0: 200081e7 40000001 $p0 bra allwarp 0xf0
000000a8: 00002001 5000cff8 (not $p0) mul rn f64 $r0d $r0d 0x3fe0000000000000
r0d = input * 0.5
000000b0: 1001a001 50000000 (not $p0) mul rn f64 $r6d $r0d $r4d
r6d = 0.5 * input * guess
000000b8: 3061a201 20088c00 (not $p0) fma rn f64 $r6d neg $r6d $r4d c3[0xc]
r6d = -0.5 * input * guess * guess + 0.5 (? actually
0.5000001190928742, i.e. 0x3fe000003ff00000... seems like a bug, but
maybe it'll only read it as a f32, not f64 like I'm assuming)
000000c0: 18412001 20080000 (not $p0) fma rn f64 $r4d $r4d $r6d $r4d
guess = guess * (0.5 - 0.5 * input * guess * guess) + guess
i.e. newton-raphson step. [why didn't they just throw in a 1.5 instead
of 0.5? who knows. maybe something to do with numeric stability. or
perhaps a bug in their const upload logic which got papered over with
the extra + guess.]
000000c8: 10002001 50000000 (not $p0) mul rn f64 $r0d $r0d $r4d
000000d0: 30002201 20088c00 (not $p0) fma rn f64 $r0d neg $r0d $r4d c3[0xc]
000000d8: 00402001 20080000 (not $p0) fma rn f64 $r0d $r4d $r0d $r4d
000000e0: 08012001 50000000 (not $p0) mul rn f64 $r4d $r0d $r2d
And this is another step.
Not sure why they're so careful to still go through the motions for
infinity and only skip for nan -- seems like they could just as well
skip it for inf as well, with less code. That's what I plan on doing.
On Mon, Feb 23, 2015 at 10:40 AM, Ilia Mirkin <imirkin@alum.mit.edu> wrote:
> Oh right. I think the NVIDIA blob executes those steps conditionally
> based on the upper bits not being 0x7ff (== infinity/nan). I should do
> the same thing here. [FWIW I was able to test the nv50 code last night
> and that one's a total fail for rcp/rsq... will need to port that over
> to my nvc0 and debug there.]
>
> On Mon, Feb 23, 2015 at 8:24 AM, Roland Scheidegger <sroland@vmware.com> wrote:
>> Does this give correct results for special floats (0, infs)?
>> We tried to improve (for single floats) x86 rcp in llvmpipe with
>> newton-raphson, but unfortunately not being able to give correct results
>> for these two cases (without even more additional code) meant it got all
>> disabled in the end (you can still see that code in the driver) since
>> the problems are at least as bad as those due to bad accuracy...
>>
>> Roland
>>
>> Am 23.02.2015 um 05:01 schrieb Ilia Mirkin:
>>> Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
>>> ---
>>>
>>> Not sure how many steps are needed for the necessary accuracy. Just
>>> doing 2 because that seems like a reasonable number.
>>>
>>> .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 42 ++++++++++++++++++++--
>>> 1 file changed, 39 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
>>> index 87e75e1..9767566 100644
>>> --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
>>> +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
>>> @@ -77,8 +77,9 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
>>> bld.setPosition(i, false);
>>>
>>> // 1. Take the source and it up.
>>> - Value *src[2], *dst[2], *def = i->getDef(0);
>>> - bld.mkSplit(src, 4, i->getSrc(0));
>>> + Value *input = i->getSrc(0);
>>> + Value *src[2], *dst[2], *guess, *def = i->getDef(0);
>>> + bld.mkSplit(src, 4, input);
>>>
>>> // 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
>>> dst[0] = bld.loadImm(NULL, 0);
>>> @@ -93,7 +94,42 @@ NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
>>>
>>> // 4. Recombine the two dst pieces back into the original destination.
>>> bld.setPosition(i, true);
>>> - bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]);
>>> + guess = bld.mkOp2v(OP_MERGE, TYPE_U64, bld.getSSA(8), dst[0], dst[1]);
>>> +
>>> + // 5. Perform 2 Newton-Raphson steps
>>> + if (i->op == OP_RCP) {
>>> + // RCP: x_{n+1} = 2 * x_n - input * x_n^2
>>> + Value *two = bld.getSSA(8);
>>> +
>>> + bld.mkCvt(OP_CVT, TYPE_F64, two, TYPE_F32, bld.loadImm(NULL, 2.0f));
>>> +
>>> + guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8),
>>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess),
>>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input,
>>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess)));
>>> + guess = bld.mkOp2v(OP_SUB, TYPE_F64, bld.getSSA(8),
>>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), two, guess),
>>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), input,
>>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess)));
>>> + } else {
>>> + // RSQ: x_{n+1} = x_n (1.5 - 0.5 * input * x_n^2)
>>> + Value *half_input = bld.getSSA(8), *three_half = bld.getSSA(8);
>>> + bld.mkCvt(OP_CVT, TYPE_F64, half_input, TYPE_F32, bld.loadImm(NULL, -0.5f));
>>> + bld.mkCvt(OP_CVT, TYPE_F64, three_half, TYPE_F32, bld.loadImm(NULL, 1.5f));
>>> +
>>> + half_input = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), half_input, input);
>>> + // RSQ: x_{n+1} = x_n * (1.5 - 0.5 * input * x_n^2)
>>> + guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess,
>>> + bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input,
>>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess),
>>> + three_half));
>>> + guess = bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess,
>>> + bld.mkOp3v(OP_MAD, TYPE_F64, bld.getSSA(8), half_input,
>>> + bld.mkOp2v(OP_MUL, TYPE_F64, bld.getSSA(8), guess, guess),
>>> + three_half));
>>> + }
>>> +
>>> + bld.mkMov(def, guess);
>>> }
>>>
>>> bool
>>>
>>
_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2015-02-23 20:23 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2015-02-23 4:01 [PATCH 1/2] nv50/ir: add fp64 support on G200 (NVA0) Ilia Mirkin
[not found] ` <1424664088-14913-1-git-send-email-imirkin-FrUbXkNCsVf2fBVCVOL8/A@public.gmane.org>
2015-02-23 4:01 ` [PATCH 2/2] nvc0/ir: improve precision of double RCP/RSQ results Ilia Mirkin
2015-02-23 13:24 ` Roland Scheidegger
2015-02-23 15:40 ` Ilia Mirkin
2015-02-23 20:23 ` Ilia Mirkin
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.