Browse Source

+ added asm optimized 64-bit division helpers for i8086 by Max Nazhalov

git-svn-id: trunk@26520 -
nickysn 11 years ago
parent
commit
b91509159a
1 changed files with 539 additions and 0 deletions
  1. 539 0
      rtl/i8086/int64p.inc

+ 539 - 0
rtl/i8086/int64p.inc

@@ -225,3 +225,542 @@ asm
   pop     bp
   pop     bp
 end;
 end;
 
 
+
+{$define FPC_SYSTEM_HAS_DIV_QWORD}
+function fpc_div_qword( n, z: qword ): qword; [public, alias:'FPC_DIV_QWORD']; compilerproc;
+// Generic "schoolbook" division algorithm
+// see [D.Knuth, TAOCP, vol.2, sect.4.3.1] for explanation
+var
+  dig: byte;
+  u: array [0..7] of word;
+begin
+  asm
+      mov    dig,4
+      // Check parameters
+      mov    dx,word [n]
+      mov    cx,word [n+2]
+      mov    bx,word [n+4]
+      mov    ax,word [n+6]
+      mov    di,ax
+      or     di,bx
+      or     di,cx
+      jnz    @@s1
+      or     di,dx
+      jz     @@q // div by 0
+      // Short division
+      mov    dig,al
+      mov    dx,word [z+6]
+      cmp    dx,di
+      jc     @@s0
+      xchg   ax,dx
+      div    di
+@@s0: mov    word [result+6],ax
+      mov    ax,word [z+4]
+      div    di
+      mov    word [result+4],ax
+      mov    ax,word [z+2]
+      div    di
+      mov    word [result+2],ax
+      mov    ax,word [z]
+      div    di
+      mov    word [result],ax
+      jmp    @@q
+@@s1: // Long division
+      xor    si,si
+      cmp    word [z],dx
+      mov    di,word [z+2]
+      sbb    di,cx
+      mov    di,word [z+4]
+      sbb    di,bx
+      mov    di,word [z+6]
+      sbb    di,ax
+      jnc    @@n0
+      // underflow: return 0
+      mov    dig,0
+      mov    word [result],si
+      mov    word [result+2],si
+      mov    word [result+4],si
+      mov    word [result+6],si
+      jmp    @@q
+@@n0: // D1. Normalize divisor:
+      //   n := n shl lzv, so that 2^63<=n<2^64
+      mov    di,si
+      test   ax,ax
+      jnz    @@n2
+@@n1: add    si,16
+      or     ax,bx
+      mov    bx,cx
+      mov    cx,dx
+      mov    dx,di
+      jz     @@n1
+@@n2: test   ah,ah
+      jnz    @@n4
+      add    si,8
+      or     ah,al
+      mov    al,bh
+      mov    bh,bl
+      mov    bl,ch
+      mov    ch,cl
+      mov    cl,dh
+      mov    dh,dl
+      mov    dl,0
+      js     @@n5
+@@n3: inc    si
+      shl    dx,1
+      rcl    cx,1
+      rcl    bx,1
+      adc    ax,ax
+@@n4: jns    @@n3
+@@n5: mov    word [n],dx 
+      mov    word [n+2],cx
+      mov    word [n+4],bx
+      mov    word [n+6],ax
+      // Adjust divident accordingly:
+      //   u := uint128(z) shl lzv; lzv=si=0..63; di=0
+      mov    dx,word [z]
+      mov    cx,word [z+2]
+      mov    bx,word [z+4]
+      mov    ax,word [z+6]
+      push   bp
+      mov    bp,si // save lzv
+      test   si,8
+      jz     @@m0
+      // << by odd-8
+      xchg   al,ah
+      mov    di,ax
+      and    di,0FFh
+      mov    al,bh
+      mov    bh,bl
+      mov    bl,ch
+      mov    ch,cl
+      mov    cl,dh
+      mov    dh,dl
+      xor    dl,dl
+@@m0: and    si,7
+      jz     @@m2
+      // << 1..7
+@@m1: shl    dx,1
+      rcl    cx,1
+      rcl    bx,1
+      rcl    ax,1
+      rcl    di,1
+      dec    si
+      jnz    @@m1
+@@m2: // si=0, bp=lzv
+      // di:ax:bx:cx:dx shifted by 0..15; 0|16|32|48 shifts remain
+      sub    bp,16
+      jc     @@m5
+      sub    bp,16
+      jc     @@m4
+      sub    bp,16
+      jc     @@m3
+      // << 48
+      pop    bp
+      mov    word [u],si
+      mov    word [u+2],si
+      mov    word [u+4],si
+      mov    word [u+6],dx
+      mov    word [u+8],cx
+      mov    word [u+10],bx
+      mov    word [u+12],ax
+      mov    word [u+14],di
+      jmp    @@m6
+@@m3: // << 32
+      pop    bp
+      mov    word [u],si
+      mov    word [u+2],si
+      mov    word [u+4],dx
+      mov    word [u+6],cx
+      mov    word [u+8],bx
+      mov    word [u+10],ax
+      mov    word [u+12],di
+      mov    word [u+14],si
+      jmp    @@m6
+@@m4: // << 16
+      pop    bp
+      mov    word [u],si
+      mov    word [u+2],dx
+      mov    word [u+4],cx
+      mov    word [u+6],bx
+      mov    word [u+8],ax
+      mov    word [u+10],di
+      mov    word [u+12],si
+      mov    word [u+14],si
+      jmp    @@m6
+@@m5: // << 0
+      pop    bp
+      mov    word [u],dx
+      mov    word [u+2],cx
+      mov    word [u+4],bx
+      mov    word [u+6],ax
+      mov    word [u+8],di
+      mov    word [u+10],si
+      mov    word [u+12],si
+      mov    word [u+14],si
+@@m6: // D2. Start from j:=3, si:=@u[j], bx:=@q[j]
+      lea    si,word [u+6]
+      lea    bx,word [result+6]
+@@d0: push   bx
+      // D3. Estimate the next quotient digit:
+      //   q_hat := [u(j+4):u(j+3)]/[n3]
+      //   use max.possible q_hat if division overflows
+      mov    ax,-1
+      mov    dx,ss:[si+8]
+      mov    di,word [n+6]
+      cmp    dx,di
+      jnc    @@d1
+      mov    ax,ss:[si+6]
+      div    di
+@@d1: // D4. Multiply & subtract calculating partial reminder:
+      //   r := [u(j+4):u(j+3):u(j+2):u(j+1):u(j)]-q_hat*[n3:n2:n1:n0]
+      push   ax // q_hat
+      push   si // @u[j]
+      mov    si,ax
+      mul    word [n]
+      mov    bx,ax
+      mov    cx,dx
+      mov    ax,word [n+2]
+      mul    si
+      add    cx,ax
+      adc    dx,0
+      mov    di,dx
+      mov    ax,word [n+4]
+      mul    si
+      add    di,ax
+      adc    dx,0
+      xchg   dx,si
+      mov    ax,word [n+6]
+      mul    dx
+      add    ax,si
+      pop    si // @u[j]
+      adc    dx,0
+      sub    ss:[si],bx
+      sbb    ss:[si+2],cx
+      sbb    ss:[si+4],di
+      sbb    ss:[si+6],ax
+      sbb    ss:[si+8],dx
+      pop    di // q_hat
+      // D5. Test reminder
+      jnc    @@d3 // 0<=r<n
+      // D6. Add back once or twice correcting the quotient and remainder:
+      //   while (r<0) do { q_hat--; r+=n; }
+      mov    dx,word [n]
+      mov    cx,word [n+2]
+      mov    bx,word [n+4]
+      mov    ax,word [n+6] 
+@@d2: dec    di
+      add    ss:[si],dx
+      adc    ss:[si+2],cx
+      adc    ss:[si+4],bx
+      adc    ss:[si+6],ax
+      adc    word ss:[si+8],0
+      jnc    @@d2
+@@d3: // D7. Store q[j], loop on j--
+      pop    bx // @q[j]
+      dec    si
+      dec    si
+      mov    ss:[bx],di
+      dec    bx
+      dec    bx
+      dec    dig
+      jnz    @@d0
+@@q:
+  end;
+  if dig<>0 then
+    HandleErrorAddrFrameInd(200,get_pc_addr,get_frame);
+end;
+
+
+{$define FPC_SYSTEM_HAS_MOD_QWORD}
+function fpc_mod_qword( n, z: qword ): qword; [public, alias:'FPC_MOD_QWORD']; compilerproc;
+// Generic "schoolbook" division algorithm
+// see [D.Knuth, TAOCP, vol.2, sect.4.3.1] for explanation
+var
+  dig: byte;
+  lzv: word;
+  u: array [0..7] of word;
+begin
+  asm
+      mov    dig,4
+      // Check parameters
+      mov    dx,word [n]
+      mov    cx,word [n+2]
+      mov    bx,word [n+4]
+      mov    ax,word [n+6]
+      mov    di,ax
+      or     di,bx
+      or     di,cx
+      jnz    @@s1
+      or     di,dx
+      jz     @@q // div by 0
+      // Short division
+      mov    dig,al
+      mov    dx,word [z+6]
+      cmp    dx,di
+      jc     @@s0
+      xchg   ax,dx
+      div    di
+@@s0: mov    ax,word [z+4]
+      div    di
+      mov    ax,word [z+2]
+      div    di
+      mov    ax,word [z]
+      div    di
+      mov    word [result],dx
+      mov    word [result+2],cx
+      mov    word [result+4],cx
+      mov    word [result+6],cx
+      jmp    @@q
+@@s1: // Long division
+      xor    si,si
+      cmp    word [z],dx
+      mov    di,word [z+2]
+      sbb    di,cx
+      mov    di,word [z+4]
+      sbb    di,bx
+      mov    di,word [z+6]
+      sbb    di,ax
+      jnc    @@n0
+      // underflow: return z
+      mov    dig,0
+      mov    dx,word [z]
+      mov    cx,word [z+2]
+      mov    bx,word [z+4]
+      mov    ax,word [z+6]
+      jmp    @@r6
+@@n0: // D1. Normalize divisor:
+      //   n := n shl lzv, so that 2^63<=n<2^64
+      mov    di,si
+      test   ax,ax
+      jnz    @@n2
+@@n1: add    si,16
+      or     ax,bx
+      mov    bx,cx
+      mov    cx,dx
+      mov    dx,di
+      jz     @@n1
+@@n2: test   ah,ah
+      jnz    @@n4
+      add    si,8
+      or     ah,al
+      mov    al,bh
+      mov    bh,bl
+      mov    bl,ch
+      mov    ch,cl
+      mov    cl,dh
+      mov    dh,dl
+      mov    dl,0
+      js     @@n5
+@@n3: inc    si
+      shl    dx,1
+      rcl    cx,1
+      rcl    bx,1
+      adc    ax,ax
+@@n4: jns    @@n3
+@@n5: mov    word [n],dx 
+      mov    word [n+2],cx
+      mov    word [n+4],bx
+      mov    word [n+6],ax
+      mov    lzv,si
+      // Adjust divident accordingly:
+      //   u := uint128(z) shl lzv; lzv=si=0..63; di=0
+      mov    dx,word [z]
+      mov    cx,word [z+2]
+      mov    bx,word [z+4]
+      mov    ax,word [z+6]
+      push   bp
+      mov    bp,si // save lzv
+      test   si,8
+      jz     @@m0
+      // << by odd-8
+      xchg   al,ah
+      mov    di,ax
+      and    di,0FFh
+      mov    al,bh
+      mov    bh,bl
+      mov    bl,ch
+      mov    ch,cl
+      mov    cl,dh
+      mov    dh,dl
+      xor    dl,dl
+@@m0: and    si,7
+      jz     @@m2
+      // << 1..7
+@@m1: shl    dx,1
+      rcl    cx,1
+      rcl    bx,1
+      rcl    ax,1
+      rcl    di,1
+      dec    si
+      jnz    @@m1
+@@m2: // si=0, bp=lzv
+      // di:ax:bx:cx:dx shifted by 0..15; 0|16|32|48 shifts remain
+      sub    bp,16
+      jc     @@m5
+      sub    bp,16
+      jc     @@m4
+      sub    bp,16
+      jc     @@m3
+      // << 48
+      pop    bp
+      mov    word [u],si
+      mov    word [u+2],si
+      mov    word [u+4],si
+      mov    word [u+6],dx
+      mov    word [u+8],cx
+      mov    word [u+10],bx
+      mov    word [u+12],ax
+      mov    word [u+14],di
+      jmp    @@m6
+@@m3: // << 32
+      pop    bp
+      mov    word [u],si
+      mov    word [u+2],si
+      mov    word [u+4],dx
+      mov    word [u+6],cx
+      mov    word [u+8],bx
+      mov    word [u+10],ax
+      mov    word [u+12],di
+      mov    word [u+14],si
+      jmp    @@m6
+@@m4: // << 16
+      pop    bp
+      mov    word [u],si
+      mov    word [u+2],dx
+      mov    word [u+4],cx
+      mov    word [u+6],bx
+      mov    word [u+8],ax
+      mov    word [u+10],di
+      mov    word [u+12],si
+      mov    word [u+14],si
+      jmp    @@m6
+@@m5: // << 0
+      pop    bp
+      mov    word [u],dx
+      mov    word [u+2],cx
+      mov    word [u+4],bx
+      mov    word [u+6],ax
+      mov    word [u+8],di
+      mov    word [u+10],si
+      mov    word [u+12],si
+      mov    word [u+14],si
+@@m6: // D2. Start from j:=3, si:=@u[j]
+      lea    si,word [u+6]
+@@d0: // D3. Estimate the next quotient digit:
+      //   q_hat := [u(j+4):u(j+3)]/[n3]
+      //   use max.possible q_hat if division overflows
+      mov    ax,-1
+      mov    dx,ss:[si+8]
+      mov    di,word [n+6]
+      cmp    dx,di
+      jnc    @@d1
+      mov    ax,ss:[si+6]
+      div    di
+@@d1: // D4. Multiply & subtract calculating partial reminder:
+      //   r := [u(j+4):u(j+3):u(j+2):u(j+1):u(j)]-q_hat*[n3:n2:n1:n0]
+      push   si    // @u[j]
+      mov    si,ax // q_hat
+      mul    word [n]
+      mov    bx,ax
+      mov    cx,dx
+      mov    ax,word [n+2]
+      mul    si
+      add    cx,ax
+      adc    dx,0
+      mov    di,dx
+      mov    ax,word [n+4]
+      mul    si
+      add    di,ax
+      adc    dx,0
+      xchg   dx,si
+      mov    ax,word [n+6]
+      mul    dx
+      add    ax,si
+      pop    si // @u[j]
+      adc    dx,0
+      sub    ss:[si],bx
+      sbb    ss:[si+2],cx
+      sbb    ss:[si+4],di
+      sbb    ss:[si+6],ax
+      mov    di,ss:[si+8]
+      sbb    di,dx
+      // D5. Test reminder
+      jnc    @@d3 // 0<=r<n
+      // D6. Add back once or twice correcting the remainder:
+      //   while (r<0) do { r+=n; }
+      mov    dx,word [n]
+      mov    cx,word [n+2]
+      mov    bx,word [n+4]
+      mov    ax,word [n+6]
+@@d2: add    ss:[si],dx
+      adc    ss:[si+2],cx
+      adc    ss:[si+4],bx
+      adc    ss:[si+6],ax
+      adc    di,0
+      jnc    @@d2
+@@d3: // D7. Loop on j--
+      dec    si
+      dec    si
+      dec    dig
+      jnz    @@d0
+      // D8. "Unnormalize" and return reminder:
+      //   result := [u3:u2:u1:u0] shr lzv
+      xor    ax,ax
+      mov    si,lzv
+      sub    si,16
+      jc     @@r2
+      sub    si,16
+      jc     @@r1
+      sub    si,16
+      jc     @@r0
+      // >> 48..63
+      mov    bx,ax
+      mov    cx,ax
+      mov    dx,word [u+6]
+      jmp    @@r3
+@@r0: // >> 32..47
+      mov    bx,ax
+      mov    cx,word [u+6]
+      mov    dx,word [u+4]
+      jmp    @@r3
+@@r1: // >> 16..31
+      mov    bx,word [u+6]
+      mov    cx,word [u+4]
+      mov    dx,word [u+2]
+      jmp    @@r3
+@@r2: // >> 0..15
+      mov    ax,word [u+6]
+      mov    bx,word [u+4]
+      mov    cx,word [u+2]
+      mov    dx,word [u]
+@@r3: and    si,15
+      sub    si,8
+      jc     @@r4
+      // >> 8..15
+      mov    dl,dh
+      mov    dh,cl
+      mov    cl,ch
+      mov    ch,bl
+      mov    bl,bh
+      mov    bh,al
+      mov    al,ah
+      xor    ah,ah
+@@r4: and    si,7
+      jz     @@r6
+      // >> 1..7
+@@r5: shr    ax,1
+      rcr    bx,1
+      rcr    cx,1
+      rcr    dx,1
+      dec    si
+      jnz    @@r5
+@@r6: mov    word [result],dx
+      mov    word [result+2],cx
+      mov    word [result+4],bx
+      mov    word [result+6],ax
+@@q:
+  end;
+  if dig<>0 then
+    HandleErrorAddrFrameInd(200,get_pc_addr,get_frame);
+end;
+