Browse Source

ARM: Add fast assembler implementation of floor/ceil/trunc.

Mike Pall 14 years ago
parent
commit
a48058a791
5 changed files with 285 additions and 166 deletions
  1. 1 1
      src/Makefile.dep
  2. 60 13
      src/buildvm_arm.dasc
  3. 212 147
      src/buildvm_arm.h
  4. 3 2
      src/lib_math.c
  5. 9 3
      src/lj_vm.h

+ 1 - 1
src/Makefile.dep

@@ -34,7 +34,7 @@ lib_jit.o: lib_jit.c lua.h luaconf.h lauxlib.h lualib.h lj_arch.h \
  lj_jit.h lj_ircall.h lj_iropt.h lj_target.h lj_target_*.h \
  lj_jit.h lj_ircall.h lj_iropt.h lj_target.h lj_target_*.h \
  lj_dispatch.h lj_vm.h lj_vmevent.h lj_lib.h luajit.h lj_libdef.h
  lj_dispatch.h lj_vm.h lj_vmevent.h lj_lib.h luajit.h lj_libdef.h
 lib_math.o: lib_math.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h \
 lib_math.o: lib_math.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h \
- lj_def.h lj_arch.h lj_lib.h lj_libdef.h
+ lj_def.h lj_arch.h lj_lib.h lj_vm.h lj_libdef.h
 lib_os.o: lib_os.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h lj_def.h \
 lib_os.o: lib_os.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h lj_def.h \
  lj_arch.h lj_err.h lj_errmsg.h lj_lib.h lj_libdef.h
  lj_arch.h lj_err.h lj_errmsg.h lj_lib.h lj_libdef.h
 lib_package.o: lib_package.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h \
 lib_package.o: lib_package.c lua.h luaconf.h lauxlib.h lualib.h lj_obj.h \

+ 60 - 13
src/buildvm_arm.dasc

@@ -1308,10 +1308,7 @@ static void build_subroutines(BuildCtx *ctx)
   |  movmi CARG1, #0x80000000
   |  movmi CARG1, #0x80000000
   |  bmi <1
   |  bmi <1
   |4:
   |4:
-  |  // NYI: Use internal implementation.
-  |  IOS mov RA, BASE
-  |  bl extern func
-  |  IOS mov BASE, RA
+  |  bl ->vm_..func
   |  b ->fff_restv
   |  b ->fff_restv
   |.endmacro
   |.endmacro
   |
   |
@@ -2010,23 +2007,76 @@ static void build_subroutines(BuildCtx *ctx)
   |// double lj_vm_floor/ceil/trunc(double x);
   |// double lj_vm_floor/ceil/trunc(double x);
   |.macro vm_round, func
   |.macro vm_round, func
   |->vm_ .. func:
   |->vm_ .. func:
-  |  // NYI: Use internal implementation.
-  |  b extern func
+  |  lsl CARG3, CARG2, #1
+  |  adds RB, CARG3, #0x00200000
+  |  bpl >2				// |x| < 1?
+  |  mvn CARG4, #0x3cc
+  |  subs RB, CARG4, RB, asr #21	// 2^0: RB = 51, 2^51: RB = 0.
+  |  bxlo lr				// |x| >= 2^52: done.
+  |  mvn CARG4, #1
+  |   bic CARG3, CARG1, CARG4, lsl RB	// ztest = lo & ~lomask
+  |  and CARG1, CARG1, CARG4, lsl RB	// lo &= lomask
+  |  subs RB, RB, #32
+  |   bicpl CARG4, CARG2, CARG4, lsl RB	// |x| <= 2^20: ztest |= hi & ~himask
+  |   orrpl CARG3, CARG3, CARG4
+  |   mvnpl CARG4, #1
+  |  andpl CARG2, CARG2, CARG4, lsl RB	// |x| <= 2^20: hi &= himask
+  |.if "func" == "floor"
+  |   tst CARG3, CARG2, asr #31		// iszero = ((ztest & signmask) == 0)
+  |.else
+  |   bics CARG3, CARG3, CARG2, asr #31	// iszero = ((ztest & ~signmask) == 0)
+  |.endif
+  |  bxeq lr				// iszero: done.
+  |  mvn CARG4, #1
+  |  cmp RB, #0
+  |  lslpl CARG3, CARG4, RB
+  |  mvnmi CARG3, #0
+  |  add RB, RB, #32
+  |  subs CARG1, CARG1, CARG4, lsl RB	// lo = lo-lomask
+  |  sbc CARG2, CARG2, CARG3		// hi = hi-himask+carry
+  |  bx lr
+  |
+  |2:  // |x| < 1:
+  |  orr CARG3, CARG3, CARG1		// ztest = (2*hi) | lo
+  |.if "func" == "floor"
+  |  tst CARG3, CARG2, asr #31		// iszero = ((ztest & signmask) == 0)
+  |.else
+  |  bics CARG3, CARG3, CARG2, asr #31	// iszero = ((ztest & ~signmask) == 0)
+  |.endif
+  |  mov CARG1, #0			// lo = 0
+  |  and CARG2, CARG2, #0x80000000
+  |  ldrne CARG4, <9			// hi = sign(x) | (iszero ? 0.0 : 1.0)
+  |  orrne CARG2, CARG2, CARG4
+  |  bx lr
   |.endmacro
   |.endmacro
   |
   |
+  |9:
+  |  .long 0x3ff00000			// hiword(1.0)
   |  vm_round floor
   |  vm_round floor
   |  vm_round ceil
   |  vm_round ceil
-#if LJ_HASJIT
-  |  vm_round trunc
-#else
+  |
   |->vm_trunc:
   |->vm_trunc:
+#if LJ_HASJIT
+  |  lsl CARG3, CARG2, #1
+  |  adds RB, CARG3, #0x00200000
+  |  andpl CARG2, CARG2, #0x80000000	// |x| < 1? hi = sign(x), lo = 0.
+  |  movpl CARG1, #0
+  |  bxpl lr
+  |  mvn CARG4, #0x3cc
+  |  subs RB, CARG4, RB, asr #21	// 2^0: RB = 51, 2^51: RB = 0.
+  |  bxlo lr				// |x| >= 2^52: already done.
+  |  mvn CARG4, #1
+  |  and CARG1, CARG1, CARG4, lsl RB	// lo &= lomask
+  |  subs RB, RB, #32
+  |  andpl CARG2, CARG2, CARG4, lsl RB	// |x| <= 2^20: hi &= himask
+  |  bx lr
 #endif
 #endif
   |
   |
   |  // double lj_vm_mod(double dividend, double divisor);
   |  // double lj_vm_mod(double dividend, double divisor);
   |->vm_mod:
   |->vm_mod:
   |  push {r0, r1, r2, r3, r4, lr}
   |  push {r0, r1, r2, r3, r4, lr}
   |  bl extern __aeabi_ddiv
   |  bl extern __aeabi_ddiv
-  |  bl extern floor  // NYI: Use internal implementation of floor.
+  |  bl ->vm_floor
   |  ldrd CARG34, [sp, #8]
   |  ldrd CARG34, [sp, #8]
   |  bl extern __aeabi_dmul
   |  bl extern __aeabi_dmul
   |  ldrd CARG34, [sp]
   |  ldrd CARG34, [sp]
@@ -2586,9 +2636,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |5:  // FP variant.
     |5:  // FP variant.
     |  ins_arithfallback ins_arithcheck_num
     |  ins_arithfallback ins_arithcheck_num
     |.if "intins" == "vm_modi"
     |.if "intins" == "vm_modi"
-    |  IOS mov RC, BASE
     |  bl fpcall
     |  bl fpcall
-    |  IOS mov BASE, RC  // NYI: remove once we use internal impl. of floor.
     |.else
     |.else
     |  bl fpcall
     |  bl fpcall
     |   ins_next1
     |   ins_next1
@@ -3966,7 +4014,6 @@ static void emit_asm_debug(BuildCtx *ctx)
     fprintf(ctx->fp,
     fprintf(ctx->fp,
 	"\t.align 2\n"
 	"\t.align 2\n"
 	".LEFDE0:\n\n");
 	".LEFDE0:\n\n");
-    /* NYI: emit ARM.exidx. */
     break;
     break;
   default:
   default:
     break;
     break;

File diff suppressed because it is too large
+ 212 - 147
src/buildvm_arm.h


+ 3 - 2
src/lib_math.c

@@ -14,6 +14,7 @@
 
 
 #include "lj_obj.h"
 #include "lj_obj.h"
 #include "lj_lib.h"
 #include "lj_lib.h"
+#include "lj_vm.h"
 
 
 /* ------------------------------------------------------------------------ */
 /* ------------------------------------------------------------------------ */
 
 
@@ -162,7 +163,7 @@ LJLIB_CF(math_random)		LJLIB_REC(.)
     double r1 = lj_lib_checknum(L, 1);
     double r1 = lj_lib_checknum(L, 1);
 #endif
 #endif
     if (n == 1) {
     if (n == 1) {
-      d = floor(d*r1) + 1.0;  /* d is an int in range [1, r1] */
+      d = lj_vm_floor(d*r1) + 1.0;  /* d is an int in range [1, r1] */
     } else {
     } else {
 #if LJ_DUALNUM
 #if LJ_DUALNUM
       double r2;
       double r2;
@@ -176,7 +177,7 @@ LJLIB_CF(math_random)		LJLIB_REC(.)
 #else
 #else
       double r2 = lj_lib_checknum(L, 2);
       double r2 = lj_lib_checknum(L, 2);
 #endif
 #endif
-      d = floor(d*(r2-r1+1.0)) + r1;  /* d is an int in range [r1, r2] */
+      d = lj_vm_floor(d*(r2-r1+1.0)) + r1;  /* d is an int in range [r1, r2] */
     }
     }
 #if LJ_DUALNUM
 #if LJ_DUALNUM
     if (isint) {
     if (isint) {

+ 9 - 3
src/lj_vm.h

@@ -45,7 +45,15 @@ LJ_ASMF void lj_vm_callhook(void);
 LJ_ASMF void lj_vm_exit_handler(void);
 LJ_ASMF void lj_vm_exit_handler(void);
 LJ_ASMF void lj_vm_exit_interp(void);
 LJ_ASMF void lj_vm_exit_interp(void);
 
 
-/* Handlers callable from compiled code. */
+/* Internal math helper functions. */
+#if LJ_TARGET_X86ORX64
+#define lj_vm_floor(x)	floor(x)
+#define lj_vm_ceil(x)	ceil(x)
+#else
+LJ_ASMF double lj_vm_floor(double);
+LJ_ASMF double lj_vm_ceil(double);
+#endif
+
 #if LJ_HASJIT
 #if LJ_HASJIT
 #if LJ_TARGET_X86ORX64
 #if LJ_TARGET_X86ORX64
 LJ_ASMF void lj_vm_floor_sse(void);
 LJ_ASMF void lj_vm_floor_sse(void);
@@ -56,8 +64,6 @@ LJ_ASMF void lj_vm_exp2_x87(void);
 LJ_ASMF void lj_vm_pow_sse(void);
 LJ_ASMF void lj_vm_pow_sse(void);
 LJ_ASMF void lj_vm_powi_sse(void);
 LJ_ASMF void lj_vm_powi_sse(void);
 #else
 #else
-LJ_ASMF double lj_vm_floor(double);
-LJ_ASMF double lj_vm_ceil(double);
 LJ_ASMF double lj_vm_trunc(double);
 LJ_ASMF double lj_vm_trunc(double);
 LJ_ASMF double lj_vm_powi(double, int32_t);
 LJ_ASMF double lj_vm_powi(double, int32_t);
 #if defined(__ANDROID__) || defined(__symbian__)
 #if defined(__ANDROID__) || defined(__symbian__)

Some files were not shown because too many files changed in this diff