2 роки тому · de2e1ca9d3
--- a/doc/running.html
+++ b/doc/running.html
@@ -220,6 +220,12 @@ mix the three forms, but note that setting an optimization level
 
				 overrides all earlier flags.
			
 
				 </p>
			
 
				 <p>
			
 
				+Note that <tt>-Ofma</tt> is not enabled by default at any level,
			
 
				+because it affects floating-point result accuracy. Only enable this
			
 
				+if you fully understand the trade-offs of FMA for performance (higher),
			
 
				+determinism (lower) and numerical accuracy (higher).
			
 
				+</p>
			
 
				+<p>
			
 
				 Here are the available flags and at what optimization levels they
			
 
				 are enabled:
			
 
				 </p>
			
@@ -251,6 +257,8 @@ are enabled:
 
				 <td class="flag_name">sink</td><td class="flag_level">&nbsp;</td><td class="flag_level">&nbsp;</td><td class="flag_level">&bull;</td><td class="flag_desc">Allocation/Store Sinking</td></tr>
			
 
				 <tr class="even">
			
 
				 <td class="flag_name">fuse</td><td class="flag_level">&nbsp;</td><td class="flag_level">&nbsp;</td><td class="flag_level">&bull;</td><td class="flag_desc">Fusion of operands into instructions</td></tr>
			
 
				+<tr class="odd">
			
 
				+<td class="flag_name">fma </td><td class="flag_level">&nbsp;</td><td class="flag_level">&nbsp;</td><td class="flag_level">&nbsp;</td><td class="flag_desc">Fused multiply-add</td></tr>
			
 
				 </table>
			
 
				 <p>
			
 
				 Here are the parameters and their default settings:
			
--- a/src/lj_asm_arm.h
+++ b/src/lj_asm_arm.h
@@ -313,7 +313,11 @@ static void asm_fusexref(ASMState *as, ARMIns ai, Reg rd, IRRef ref,
 
				 }
			
 
				 
			
 
				 #if !LJ_SOFTFP
			
 
				-/* Fuse to multiply-add/sub instruction. */
			
 
				+/*
			
 
				+** Fuse to multiply-add/sub instruction.
			
 
				+** VMLA rounds twice (UMA, not FMA) -- no need to check for JIT_F_OPT_FMA.
			
 
				+** VFMA needs VFPv4, which is uncommon on the remaining ARM32 targets.
			
 
				+*/
			
 
				 static int asm_fusemadd(ASMState *as, IRIns *ir, ARMIns ai, ARMIns air)
			
 
				 {
			
 
				   IRRef lref = ir->op1, rref = ir->op2;
			
--- a/src/lj_asm_arm64.h
+++ b/src/lj_asm_arm64.h
@@ -337,7 +337,8 @@ static int asm_fusemadd(ASMState *as, IRIns *ir, A64Ins ai, A64Ins air)
 
				 {
			
 
				   IRRef lref = ir->op1, rref = ir->op2;
			
 
				   IRIns *irm;
			
 
				-  if (lref != rref &&
			
 
				+  if ((as->flags & JIT_F_OPT_FMA) &&
			
 
				+      lref != rref &&
			
 
				       ((mayfuse(as, lref) && (irm = IR(lref), irm->o == IR_MUL) &&
			
 
				        ra_noreg(irm->r)) ||
			
 
				        (mayfuse(as, rref) && (irm = IR(rref), irm->o == IR_MUL) &&
			
--- a/src/lj_asm_ppc.h
+++ b/src/lj_asm_ppc.h
@@ -235,7 +235,8 @@ static int asm_fusemadd(ASMState *as, IRIns *ir, PPCIns pi, PPCIns pir)
 
				 {
			
 
				   IRRef lref = ir->op1, rref = ir->op2;
			
 
				   IRIns *irm;
			
 
				-  if (lref != rref &&
			
 
				+  if ((as->flags & JIT_F_OPT_FMA) &&
			
 
				+      lref != rref &&
			
 
				       ((mayfuse(as, lref) && (irm = IR(lref), irm->o == IR_MUL) &&
			
 
				 	ra_noreg(irm->r)) ||
			
 
				        (mayfuse(as, rref) && (irm = IR(rref), irm->o == IR_MUL) &&
			
--- a/src/lj_jit.h
+++ b/src/lj_jit.h
@@ -87,10 +87,11 @@
 
				 #define JIT_F_OPT_ABC		(JIT_F_OPT << 7)
			
 
				 #define JIT_F_OPT_SINK		(JIT_F_OPT << 8)
			
 
				 #define JIT_F_OPT_FUSE		(JIT_F_OPT << 9)
			
 
				+#define JIT_F_OPT_FMA		(JIT_F_OPT << 10)
			
 
				 
			
 
				 /* Optimizations names for -O. Must match the order above. */
			
 
				 #define JIT_F_OPTSTRING	\
			
 
				-  "\4fold\3cse\3dce\3fwd\3dse\6narrow\4loop\3abc\4sink\4fuse"
			
 
				+  "\4fold\3cse\3dce\3fwd\3dse\6narrow\4loop\3abc\4sink\4fuse\3fma"
			
 
				 
			
 
				 /* Optimization levels set a fixed combination of flags. */
			
 
				 #define JIT_F_OPT_0	0
			
@@ -99,6 +100,7 @@
 
				 #define JIT_F_OPT_3	(JIT_F_OPT_2|\
			
 
				   JIT_F_OPT_FWD|JIT_F_OPT_DSE|JIT_F_OPT_ABC|JIT_F_OPT_SINK|JIT_F_OPT_FUSE)
			
 
				 #define JIT_F_OPT_DEFAULT	JIT_F_OPT_3
			
 
				+/* Note: FMA is not set by default. */
			
 
				 
			
 
				 /* -- JIT engine parameters ----------------------------------------------- */
			
 
				 
			
--- a/src/lj_vmmath.c
+++ b/src/lj_vmmath.c
@@ -36,6 +36,17 @@ LJ_FUNCA double lj_wrap_fmod(double x, double y) { return fmod(x, y); }
 
				 
			
 
				 /* -- Helper functions ---------------------------------------------------- */
			
 
				 
			
 
				+/* Required to prevent the C compiler from applying FMA optimizations.
			
 
				+**
			
 
				+** Yes, there's -ffp-contract and the FP_CONTRACT pragma ... in theory.
			
 
				+** But the current state of C compilers is a mess in this regard.
			
 
				+** Also, this function is not performance sensitive at all.
			
 
				+*/
			
 
				+LJ_NOINLINE static double lj_vm_floormul(double x, double y)
			
 
				+{
			
 
				+  return lj_vm_floor(x / y) * y;  /* Multiply only; the caller subtracts. */
			
 
				+}
			
 
				+
			
 
				 double lj_vm_foldarith(double x, double y, int op)
			
 
				 {
			
 
				   switch (op) {
			
@@ -43,7 +54,7 @@ double lj_vm_foldarith(double x, double y, int op)
 
				   case IR_SUB - IR_ADD: return x-y; break;
			
 
				   case IR_MUL - IR_ADD: return x*y; break;
			
 
				   case IR_DIV - IR_ADD: return x/y; break;
			
 
				-  case IR_MOD - IR_ADD: return x-lj_vm_floor(x/y)*y; break;
			
 
				+  case IR_MOD - IR_ADD: return x-lj_vm_floormul(x, y); break;
			
 
				   case IR_POW - IR_ADD: return pow(x, y); break;
			
 
				   case IR_NEG - IR_ADD: return -x; break;
			
 
				   case IR_ABS - IR_ADD: return fabs(x); break;
			
--- a/src/vm_arm64.dasc
+++ b/src/vm_arm64.dasc
@@ -2636,7 +2636,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
 
				     |.macro ins_arithmod, res, reg1, reg2
			
 
				     |  fdiv d2, reg1, reg2
			
 
				     |  frintm d2, d2
			
 
				-    |  fmsub res, d2, reg2, reg1
			
 
				+    |  // Cannot use fmsub, because FMA is not enabled by default.
			
 
				+    |  fmul d2, d2, reg2
			
 
				+    |  fsub res, reg1, d2
			
 
				     |.endmacro
			
 
				     |
			
 
				     |.macro ins_arithdn, intins, fpins