Parcourir la source

* patch from Max Nazhalov with improvements to the i8086 64-bit division:
since [unnormalized] n >= 0x10000 now, we have
1. lzv<=47, so all code that makes shifts by >= 48 is dead;
2. q3=0, so main loop can be executed just 3 times instead of 4.

git-svn-id: trunk@26535 -

nickysn il y a 11 ans
Parent
commit
a85bb98aa6
1 fichiers modifiés avec 19 ajouts et 55 suppressions
  1. 19 55
      rtl/i8086/int64p.inc

+ 19 - 55
rtl/i8086/int64p.inc

@@ -232,10 +232,10 @@ function fpc_div_qword( n, z: qword ): qword; [public, alias:'FPC_DIV_QWORD']; c
 // see [D.Knuth, TAOCP, vol.2, sect.4.3.1] for explanation
 var
   dig: byte;
-  u: array [0..7] of word;
+  u: array [0..6] of word;
 begin
   asm
-      mov    dig,4
+      mov    dig,3 // quotient contains 3 digits for "long" division path
       // Check parameters
       mov    dx,word [n]
       mov    cx,word [n+2]
@@ -284,6 +284,8 @@ begin
       jmp    @@q
 @@n0: // D1. Normalize divisor:
       //   n := n shl lzv, so that 2^63<=n<2^64
+      // Note: n>=0x10000 leads to lzv<=47 and q3=0
+      mov    word [result+6],si // q3:=0
       mov    di,si
       test   ax,ax
       jnz    @@n2
@@ -316,7 +318,7 @@ begin
       mov    word [n+4],bx
       mov    word [n+6],ax
       // Adjust divident accordingly:
-      //   u := uint128(z) shl lzv; lzv=si=0..63; di=0
+      //   u := uint128(z) shl lzv; lzv=si=0..47; di=0
       mov    dx,word [z]
       mov    cx,word [z+2]
       mov    bx,word [z+4]
@@ -347,25 +349,12 @@ begin
       dec    si
       jnz    @@m1
 @@m2: // si=0, bp=lzv
-      // di:ax:bx:cx:dx shifted by 0..15; 0|16|32|48 shifts remain
+      // di:ax:bx:cx:dx shifted by 0..15; 0|16|32 shifts remain
       sub    bp,16
       jc     @@m5
       sub    bp,16
       jc     @@m4
-      sub    bp,16
-      jc     @@m3
-      // << 48
-      pop    bp
-      mov    word [u],si
-      mov    word [u+2],si
-      mov    word [u+4],si
-      mov    word [u+6],dx
-      mov    word [u+8],cx
-      mov    word [u+10],bx
-      mov    word [u+12],ax
-      mov    word [u+14],di
-      jmp    @@m6
-@@m3: // << 32
+      // << 32
       pop    bp
       mov    word [u],si
       mov    word [u+2],si
@@ -374,7 +363,6 @@ begin
       mov    word [u+8],bx
       mov    word [u+10],ax
       mov    word [u+12],di
-      mov    word [u+14],si
       jmp    @@m6
 @@m4: // << 16
       pop    bp
@@ -385,7 +373,6 @@ begin
       mov    word [u+8],ax
       mov    word [u+10],di
       mov    word [u+12],si
-      mov    word [u+14],si
       jmp    @@m6
 @@m5: // << 0
       pop    bp
@@ -396,10 +383,9 @@ begin
       mov    word [u+8],di
       mov    word [u+10],si
       mov    word [u+12],si
-      mov    word [u+14],si
-@@m6: // D2. Start from j:=3, si:=@u[j], bx:=@q[j]
-      lea    si,word [u+6]
-      lea    bx,word [result+6]
+@@m6: // D2. Start from j:=2 (since u7=0 and u6<n3), si:=@u[j], bx:=@q[j]
+      lea    si,word [u+4]
+      lea    bx,word [result+4]
 @@d0: push   bx
       // D3. Estimate the next quotient digit:
       //   q_hat := [u(j+4):u(j+3)]/[n3]
@@ -478,10 +464,10 @@ function fpc_mod_qword( n, z: qword ): qword; [public, alias:'FPC_MOD_QWORD']; c
 var
   dig: byte;
   lzv: word;
-  u: array [0..7] of word;
+  u: array [0..6] of word;
 begin
   asm
-      mov    dig,4
+      mov    dig,3 // quotient contains 3 digist for "long" division path
       // Check parameters
       mov    dx,word [n]
       mov    cx,word [n+2]
@@ -530,6 +516,7 @@ begin
       jmp    @@r6
 @@n0: // D1. Normalize divisor:
       //   n := n shl lzv, so that 2^63<=n<2^64
+      // Note: n>=0x10000 leads to lzv<=47
       mov    di,si
       test   ax,ax
       jnz    @@n2
@@ -563,7 +550,7 @@ begin
       mov    word [n+6],ax
       mov    lzv,si
       // Adjust divident accordingly:
-      //   u := uint128(z) shl lzv; lzv=si=0..63; di=0
+      //   u := uint128(z) shl lzv; lzv=si=0..47; di=0
       mov    dx,word [z]
       mov    cx,word [z+2]
       mov    bx,word [z+4]
@@ -594,25 +581,12 @@ begin
       dec    si
       jnz    @@m1
 @@m2: // si=0, bp=lzv
-      // di:ax:bx:cx:dx shifted by 0..15; 0|16|32|48 shifts remain
+      // di:ax:bx:cx:dx shifted by 0..15; 0|16|32 shifts remain
       sub    bp,16
       jc     @@m5
       sub    bp,16
       jc     @@m4
-      sub    bp,16
-      jc     @@m3
-      // << 48
-      pop    bp
-      mov    word [u],si
-      mov    word [u+2],si
-      mov    word [u+4],si
-      mov    word [u+6],dx
-      mov    word [u+8],cx
-      mov    word [u+10],bx
-      mov    word [u+12],ax
-      mov    word [u+14],di
-      jmp    @@m6
-@@m3: // << 32
+      // << 32
       pop    bp
       mov    word [u],si
       mov    word [u+2],si
@@ -621,7 +595,6 @@ begin
       mov    word [u+8],bx
       mov    word [u+10],ax
       mov    word [u+12],di
-      mov    word [u+14],si
       jmp    @@m6
 @@m4: // << 16
       pop    bp
@@ -632,7 +605,6 @@ begin
       mov    word [u+8],ax
       mov    word [u+10],di
       mov    word [u+12],si
-      mov    word [u+14],si
       jmp    @@m6
 @@m5: // << 0
       pop    bp
@@ -643,9 +615,8 @@ begin
       mov    word [u+8],di
       mov    word [u+10],si
       mov    word [u+12],si
-      mov    word [u+14],si
-@@m6: // D2. Start from j:=3, si:=@u[j]
-      lea    si,word [u+6]
+@@m6: // D2. Start from j:=2 (since u7=0 and u6<n3), si:=@u[j]
+      lea    si,word [u+4]
 @@d0: // D3. Estimate the next quotient digit:
       //   q_hat := [u(j+4):u(j+3)]/[n3]
       //   use max.possible q_hat if division overflows
@@ -711,14 +682,7 @@ begin
       jc     @@r2
       sub    si,16
       jc     @@r1
-      sub    si,16
-      jc     @@r0
-      // >> 48..63
-      mov    bx,ax
-      mov    cx,ax
-      mov    dx,word [u+6]
-      jmp    @@r3
-@@r0: // >> 32..47
+      // >> 32..47
       mov    bx,ax
       mov    cx,word [u+6]
       mov    dx,word [u+4]