int64p.inc 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 2013 by the Free Pascal development team
  4. This file contains some helper routines for int64 and qword
  5. See the file COPYING.FPC, included in this distribution,
  6. for details about the copyright.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  10. **********************************************************************}
  11. {$ifndef VER3_0}
  12. {$define FPC_SYSTEM_HAS_MUL_QWORD}
  13. function fpc_mul_qword( f1, f2: qword): qword; [public,alias: 'FPC_MUL_QWORD']; compilerproc;
  14. begin
  15. { routine contributed by Max Nazhalov
  16. 64-bit multiplication via 16-bit digits: (A3:A2:A1:A0)*(B3:B2:B1:B0)
  17. //////// STEP 1; break-down to 32-bit multiplications, each of them generates 64-bit result:
  18. (A3:A2*B3:B2)<<64 + (A3:A2*B1:B0)<<32 + (A1:A0*B3:B2)<<32 + (A1:A0*B1:B0)
  19. (A1:A0*B1:B0) = (A1*B1)<<32 + (A1*B0)<<16 + (A0*B1)<<16 + (A0:B0)
  20. -- never overflows, forms the base of the final result, name it as "R64"
  21. (A3:A2*B3:B2) is not required for the 64-bit result if overflow is not checked, since it is completely beyond the resulting width.
  22. -- always overflows if "<>0", so can be checked as "((A2|A3)<>0)&&(B2|B3)<>0)"
  23. (A3:A2*B1:B0) and (A1:A0*B3:B2) are partially required for the final result
  24. -- to be calculated on steps 2 and 3 as a correction for the "R64"
  25. //////// STEP 2; calculate "R64+=(A3:A2*B1:B0)<<32" (16-bit multiplications, each of them generates 32-bit result):
  26. (A3*B1)<<32 + (A3*B0)<<16 + (A2*B1)<<16 + (A2*B0)
  27. ((A3*B1)<<32)<<32 is not required for the 64-bit result if overflow is not checked, since it is completely beyond the resulting width.
  28. -- always overflows if "<>0", so can be checked as "(A3<>0)&&(B1<>0)"
  29. ((A3*B0)<<16)<<32: only low word of "A3*B0" contributes to the final result if overflow is not checked.
  30. -- overflows if the hi_word "<>0"
  31. -- overflows if R64+(lo_word<<48) produces C-flag
  32. ((A2*B1)<<16)<<32: only low word of "A2*B1" contributes to the final result if overflow is not checked.
  33. -- overflows if the hi_word "<>0"
  34. -- overflows if R64+(lo_word<<48) produces C-flag
  35. (A2*B0)<<32: the whole dword is significand, name it as "X"
  36. -- overflows if R64+(X<<32) produces C-flag
  37. //////// STEP 3; calculate "R64+=(A1:A0*B3:B2)<<32" (16-bit multiplications, each of them generates 32-bit result):
  38. (A1*B3)<<32 + (A1*B2)<<16 + (A0*B3)<<16 + (A0*B2)
  39. ((A1*B3)<<32)<<32 is not required for the 64-bit result if overflow is not checked, since it is completely beyond the resulting width.
  40. -- always overflows if "<>0", so can be checked as "(A1<>0)&&(B3<>0)"
  41. ((A1*B2)<<16)<<32: only low word of "A1*B2" contributes to the final result if overflow is not checked.
  42. -- overflows if the hi_word "<>0"
  43. -- overflows if R64+(lo_word<<48) produces C-flag
  44. ((A0*B3)<<16)<<32: only low word "A0*B3" contributes to the final result if overflow is not checked.
  45. -- overflows if the hi_word "<>0"
  46. -- overflows if R64+(lo_word<<48) produces C-flag
  47. (A0*B2)<<32: the whole dword is significand, name it as "Y"
  48. -- overflows if R64+(Y<<32) produces C-flag
  49. }
  50. asm
  51. mov di,word[f1]
  52. mov bx,word[f1+2]
  53. mov si,word[f2]
  54. mov ax,word[f2+2]
  55. push bp
  56. mov cx,ax
  57. mul bx
  58. xchg ax,bx
  59. mov bp,dx
  60. mul si
  61. xchg ax,cx
  62. add bx,dx
  63. adc bp,0
  64. mul di
  65. add cx,ax
  66. adc bx,dx
  67. adc bp,0
  68. mov ax,di
  69. mul si
  70. add cx,dx
  71. adc bx,0
  72. adc bp,0
  73. mov dx,bp
  74. pop bp
  75. mov word[result],ax
  76. mov word[result+2],cx
  77. mov word[result+4],bx
  78. mov word[result+6],dx
  79. mov si,word[f1+4]
  80. mov ax,word[f1+6]
  81. mov di,word[f2]
  82. mul di
  83. mov cx,ax
  84. mov ax,word[f2+2]
  85. mul si
  86. add cx,ax
  87. mov ax,di
  88. mul si
  89. mov bx,ax
  90. add cx,dx
  91. mov si,word[f2+4]
  92. mov ax,word[f2+6]
  93. mov di,word[f1]
  94. mul di
  95. add cx,ax
  96. mov ax,word[f1+2]
  97. mul si
  98. add cx,ax
  99. mov ax,di
  100. mul si
  101. add bx,ax
  102. adc cx,dx
  103. add word[result+4],bx
  104. adc word[result+6],cx
  105. end [ 'ax','bx','cx','dx','si','di' ];
  106. end;
  107. function fpc_mul_qword_checkoverflow( f1, f2: qword): qword; [public,alias: 'FPC_MUL_QWORD_CHECKOVERFLOW']; compilerproc;
  108. begin
  109. { routine contributed by Max Nazhalov
  110. 64-bit multiplication via 16-bit digits: (A3:A2:A1:A0)*(B3:B2:B1:B0)
  111. //////// STEP 1; break-down to 32-bit multiplications, each of them generates 64-bit result:
  112. (A3:A2*B3:B2)<<64 + (A3:A2*B1:B0)<<32 + (A1:A0*B3:B2)<<32 + (A1:A0*B1:B0)
  113. (A1:A0*B1:B0) = (A1*B1)<<32 + (A1*B0)<<16 + (A0*B1)<<16 + (A0:B0)
  114. -- never overflows, forms the base of the final result, name it as "R64"
  115. (A3:A2*B3:B2) is not required for the 64-bit result if overflow is not checked, since it is completely beyond the resulting width.
  116. -- always overflows if "<>0", so can be checked as "((A2|A3)<>0)&&(B2|B3)<>0)"
  117. (A3:A2*B1:B0) and (A1:A0*B3:B2) are partially required for the final result
  118. -- to be calculated on steps 2 and 3 as a correction for the "R64"
  119. //////// STEP 2; calculate "R64+=(A3:A2*B1:B0)<<32" (16-bit multiplications, each of them generates 32-bit result):
  120. (A3*B1)<<32 + (A3*B0)<<16 + (A2*B1)<<16 + (A2*B0)
  121. ((A3*B1)<<32)<<32 is not required for the 64-bit result if overflow is not checked, since it is completely beyond the resulting width.
  122. -- always overflows if "<>0", so can be checked as "(A3<>0)&&(B1<>0)"
  123. ((A3*B0)<<16)<<32: only low word of "A3*B0" contributes to the final result if overflow is not checked.
  124. -- overflows if the hi_word "<>0"
  125. -- overflows if R64+(lo_word<<48) produces C-flag
  126. ((A2*B1)<<16)<<32: only low word of "A2*B1" contributes to the final result if overflow is not checked.
  127. -- overflows if the hi_word "<>0"
  128. -- overflows if R64+(lo_word<<48) produces C-flag
  129. (A2*B0)<<32: the whole dword is significand, name it as "X"
  130. -- overflows if R64+(X<<32) produces C-flag
  131. //////// STEP 3; calculate "R64+=(A1:A0*B3:B2)<<32" (16-bit multiplications, each of them generates 32-bit result):
  132. (A1*B3)<<32 + (A1*B2)<<16 + (A0*B3)<<16 + (A0*B2)
  133. ((A1*B3)<<32)<<32 is not required for the 64-bit result if overflow is not checked, since it is completely beyond the resulting width.
  134. -- always overflows if "<>0", so can be checked as "(A1<>0)&&(B3<>0)"
  135. ((A1*B2)<<16)<<32: only low word of "A1*B2" contributes to the final result if overflow is not checked.
  136. -- overflows if the hi_word "<>0"
  137. -- overflows if R64+(lo_word<<48) produces C-flag
  138. ((A0*B3)<<16)<<32: only low word "A0*B3" contributes to the final result if overflow is not checked.
  139. -- overflows if the hi_word "<>0"
  140. -- overflows if R64+(lo_word<<48) produces C-flag
  141. (A0*B2)<<32: the whole dword is significand, name it as "Y"
  142. -- overflows if R64+(Y<<32) produces C-flag
  143. }
  144. asm
  145. mov di,word[f1]
  146. mov bx,word[f1+2]
  147. mov si,word[f2]
  148. mov ax,word[f2+2]
  149. push bp
  150. mov cx,ax
  151. mul bx
  152. xchg ax,bx
  153. mov bp,dx
  154. mul si
  155. xchg ax,cx
  156. add bx,dx
  157. adc bp,0
  158. mul di
  159. add cx,ax
  160. adc bx,dx
  161. adc bp,0
  162. mov ax,di
  163. mul si
  164. add cx,dx
  165. adc bx,0
  166. adc bp,0
  167. mov dx,bp
  168. pop bp
  169. mov word[result],ax
  170. mov word[result+2],cx
  171. mov word[result+4],bx
  172. mov word[result+6],dx
  173. mov si,word[f1+4]
  174. mov ax,word[f1+6]
  175. mov bx,word[f2+6]
  176. mov cx,ax
  177. or cx,si
  178. jz @@nover1
  179. mov cx,word[f2+4]
  180. or cx,bx
  181. jnz @@overflow
  182. @@nover1:
  183. test bx,bx
  184. jz @@nover2
  185. mov bx,word[f1+2]
  186. test bx,bx
  187. jnz @@overflow
  188. @@nover2:
  189. test ax,ax
  190. jz @@nover3
  191. or bx,word[f2+2]
  192. jnz @@overflow
  193. @@nover3:
  194. mov di,word[f2]
  195. mul di
  196. test dx,dx
  197. jnz @@overflow
  198. mov cx,ax
  199. mov ax,word[f2+2]
  200. mul si
  201. test dx,dx
  202. jnz @@overflow
  203. add cx,ax
  204. jc @@overflow
  205. mov ax,di
  206. mul si
  207. mov bx,ax
  208. add cx,dx
  209. jc @@overflow
  210. mov si,word[f2+4]
  211. mov ax,word[f2+6]
  212. mov di,word[f1]
  213. mul di
  214. test dx,dx
  215. jnz @@overflow
  216. add cx,ax
  217. jc @@overflow
  218. mov ax,word[f1+2]
  219. mul si
  220. test dx,dx
  221. jnz @@overflow
  222. add cx,ax
  223. jc @@overflow
  224. mov ax,di
  225. mul si
  226. add bx,ax
  227. adc cx,dx
  228. jc @@overflow
  229. add word[result+4],bx
  230. adc word[result+6],cx
  231. jnc @@done
  232. @@overflow:
  233. call FPC_OVERFLOW
  234. @@done:
  235. end [ 'ax','bx','cx','dx','si','di' ];
  236. end;
  237. {$endif VER3_0}
  238. {$define FPC_SYSTEM_HAS_MUL_DWORD_TO_QWORD}
  239. function fpc_mul_dword_to_qword(f1,f2 : dword) : qword;[public,alias: 'FPC_MUL_DWORD_TO_QWORD']; compilerproc; assembler; nostackframe;
  240. asm
  241. push bp
  242. mov bp, sp
  243. mov di,word[bp + 8 + extra_param_offset] // word[f1]
  244. mov bx,word[bp + 10 + extra_param_offset] // word[f1+2]
  245. mov si,word[bp + 4 + extra_param_offset] // word[f2]
  246. mov ax,word[bp + 6 + extra_param_offset] // word[f2+2]
  247. mov cx,ax
  248. mul bx
  249. xchg ax,bx
  250. mov bp,dx
  251. mul si
  252. xchg ax,cx
  253. add bx,dx
  254. adc bp,0
  255. mul di
  256. add cx,ax
  257. adc bx,dx
  258. adc bp,0
  259. mov ax,di
  260. mul si
  261. add cx,dx
  262. adc bx,0
  263. adc bp,0
  264. mov dx,ax
  265. mov ax,bp
  266. pop bp
  267. end;
  268. {$define FPC_SYSTEM_HAS_DIV_QWORD}
  269. function fpc_div_qword( n, z: qword ): qword; [public, alias:'FPC_DIV_QWORD']; compilerproc;
  270. // Generic "schoolbook" division algorithm
  271. // see [D.Knuth, TAOCP, vol.2, sect.4.3.1] for explanation
  272. var
  273. dig: byte;
  274. u: array [0..6] of word;
  275. begin
  276. asm
  277. mov dig,3 // quotient contains 3 digits for "long" division path
  278. // Check parameters
  279. mov dx,word [n]
  280. mov cx,word [n+2]
  281. mov bx,word [n+4]
  282. mov ax,word [n+6]
  283. mov di,ax
  284. or di,bx
  285. or di,cx
  286. jnz @@s1
  287. or di,dx
  288. jz @@q // div by 0
  289. // Short division
  290. mov dig,al
  291. mov dx,word [z+6]
  292. cmp dx,di
  293. jc @@s0
  294. xchg ax,dx
  295. div di
  296. @@s0: mov word [result+6],ax
  297. mov ax,word [z+4]
  298. div di
  299. mov word [result+4],ax
  300. mov ax,word [z+2]
  301. div di
  302. mov word [result+2],ax
  303. mov ax,word [z]
  304. div di
  305. mov word [result],ax
  306. jmp @@q
  307. @@s1: // Long division
  308. xor si,si
  309. cmp word [z],dx
  310. mov di,word [z+2]
  311. sbb di,cx
  312. mov di,word [z+4]
  313. sbb di,bx
  314. mov di,word [z+6]
  315. sbb di,ax
  316. jnc @@n0
  317. // underflow: return 0
  318. mov dig,0
  319. mov word [result],si
  320. mov word [result+2],si
  321. mov word [result+4],si
  322. mov word [result+6],si
  323. jmp @@q
  324. @@n0: // D1. Normalize divisor:
  325. // n := n shl lzv, so that 2^63<=n<2^64
  326. // Note: n>=0x10000 leads to lzv<=47 and q3=0
  327. mov word [result+6],si // q3:=0
  328. mov di,si
  329. test ax,ax
  330. jnz @@n2
  331. @@n1: add si,16
  332. or ax,bx
  333. mov bx,cx
  334. mov cx,dx
  335. mov dx,di
  336. jz @@n1
  337. @@n2: test ah,ah
  338. jnz @@n4
  339. add si,8
  340. or ah,al
  341. mov al,bh
  342. mov bh,bl
  343. mov bl,ch
  344. mov ch,cl
  345. mov cl,dh
  346. mov dh,dl
  347. mov dl,0
  348. js @@n5
  349. @@n3: inc si
  350. shl dx,1
  351. rcl cx,1
  352. rcl bx,1
  353. adc ax,ax
  354. @@n4: jns @@n3
  355. @@n5: mov word [n],dx
  356. mov word [n+2],cx
  357. mov word [n+4],bx
  358. mov word [n+6],ax
  359. // Adjust divident accordingly:
  360. // u := uint128(z) shl lzv; lzv=si=0..47; di=0
  361. mov dx,word [z]
  362. mov cx,word [z+2]
  363. mov bx,word [z+4]
  364. mov ax,word [z+6]
  365. push bp
  366. mov bp,si // save lzv
  367. test si,8
  368. jz @@m0
  369. // << by odd-8
  370. xchg al,ah
  371. mov di,ax
  372. and di,0FFh
  373. mov al,bh
  374. mov bh,bl
  375. mov bl,ch
  376. mov ch,cl
  377. mov cl,dh
  378. mov dh,dl
  379. xor dl,dl
  380. @@m0: and si,7
  381. jz @@m2
  382. // << 1..7
  383. @@m1: shl dx,1
  384. rcl cx,1
  385. rcl bx,1
  386. rcl ax,1
  387. rcl di,1
  388. dec si
  389. jnz @@m1
  390. @@m2: // si=0, bp=lzv
  391. // di:ax:bx:cx:dx shifted by 0..15; 0|16|32 shifts remain
  392. sub bp,16
  393. jc @@m5
  394. sub bp,16
  395. jc @@m4
  396. // << 32
  397. pop bp
  398. mov word [u],si
  399. mov word [u+2],si
  400. mov word [u+4],dx
  401. mov word [u+6],cx
  402. mov word [u+8],bx
  403. mov word [u+10],ax
  404. mov word [u+12],di
  405. jmp @@m6
  406. @@m4: // << 16
  407. pop bp
  408. mov word [u],si
  409. mov word [u+2],dx
  410. mov word [u+4],cx
  411. mov word [u+6],bx
  412. mov word [u+8],ax
  413. mov word [u+10],di
  414. mov word [u+12],si
  415. jmp @@m6
  416. @@m5: // << 0
  417. pop bp
  418. mov word [u],dx
  419. mov word [u+2],cx
  420. mov word [u+4],bx
  421. mov word [u+6],ax
  422. mov word [u+8],di
  423. mov word [u+10],si
  424. mov word [u+12],si
  425. @@m6: // D2. Start from j:=2 (since u7=0 and u6<n3), si:=@u[j], bx:=@q[j]
  426. lea si,word [u+4]
  427. lea bx,word [result+4]
  428. @@d0: push bx
  429. // D3. Estimate the next quotient digit:
  430. // q_hat := [u(j+4):u(j+3)]/[n3]
  431. // use max.possible q_hat if division overflows
  432. mov ax,-1
  433. mov dx,ss:[si+8]
  434. mov di,word [n+6]
  435. cmp dx,di
  436. jnc @@d1
  437. mov ax,ss:[si+6]
  438. div di
  439. @@d1: // D4. Multiply & subtract calculating partial reminder:
  440. // r := [u(j+4):u(j+3):u(j+2):u(j+1):u(j)]-q_hat*[n3:n2:n1:n0]
  441. push ax // q_hat
  442. push si // @u[j]
  443. mov si,ax
  444. mul word [n]
  445. mov bx,ax
  446. mov cx,dx
  447. mov ax,word [n+2]
  448. mul si
  449. add cx,ax
  450. adc dx,0
  451. mov di,dx
  452. mov ax,word [n+4]
  453. mul si
  454. add di,ax
  455. adc dx,0
  456. xchg dx,si
  457. mov ax,word [n+6]
  458. mul dx
  459. add ax,si
  460. pop si // @u[j]
  461. adc dx,0
  462. sub ss:[si],bx
  463. sbb ss:[si+2],cx
  464. sbb ss:[si+4],di
  465. sbb ss:[si+6],ax
  466. sbb ss:[si+8],dx
  467. pop di // q_hat
  468. // D5. Test reminder
  469. jnc @@d3 // 0<=r<n
  470. // D6. Add back once or twice correcting the quotient and remainder:
  471. // while (r<0) do { q_hat--; r+=n; }
  472. mov dx,word [n]
  473. mov cx,word [n+2]
  474. mov bx,word [n+4]
  475. mov ax,word [n+6]
  476. @@d2: dec di
  477. add ss:[si],dx
  478. adc ss:[si+2],cx
  479. adc ss:[si+4],bx
  480. adc ss:[si+6],ax
  481. adc word ss:[si+8],0
  482. jnc @@d2
  483. @@d3: // D7. Store q[j], loop on j--
  484. pop bx // @q[j]
  485. dec si
  486. dec si
  487. mov ss:[bx],di
  488. dec bx
  489. dec bx
  490. dec dig
  491. jnz @@d0
  492. @@q:
  493. end;
  494. if dig<>0 then
  495. HandleErrorAddrFrameInd(200,get_pc_addr,get_frame);
  496. end;
  497. {$define FPC_SYSTEM_HAS_MOD_QWORD}
  498. function fpc_mod_qword( n, z: qword ): qword; [public, alias:'FPC_MOD_QWORD']; compilerproc;
  499. // Generic "schoolbook" division algorithm
  500. // see [D.Knuth, TAOCP, vol.2, sect.4.3.1] for explanation
  501. var
  502. dig: byte;
  503. lzv: word;
  504. u: array [0..6] of word;
  505. begin
  506. asm
  507. mov dig,3 // quotient contains 3 digist for "long" division path
  508. // Check parameters
  509. mov dx,word [n]
  510. mov cx,word [n+2]
  511. mov bx,word [n+4]
  512. mov ax,word [n+6]
  513. mov di,ax
  514. or di,bx
  515. or di,cx
  516. jnz @@s1
  517. or di,dx
  518. jz @@q // div by 0
  519. // Short division
  520. mov dig,al
  521. mov dx,word [z+6]
  522. cmp dx,di
  523. jc @@s0
  524. xchg ax,dx
  525. div di
  526. @@s0: mov ax,word [z+4]
  527. div di
  528. mov ax,word [z+2]
  529. div di
  530. mov ax,word [z]
  531. div di
  532. mov word [result],dx
  533. mov word [result+2],cx
  534. mov word [result+4],cx
  535. mov word [result+6],cx
  536. jmp @@q
  537. @@s1: // Long division
  538. xor si,si
  539. cmp word [z],dx
  540. mov di,word [z+2]
  541. sbb di,cx
  542. mov di,word [z+4]
  543. sbb di,bx
  544. mov di,word [z+6]
  545. sbb di,ax
  546. jnc @@n0
  547. // underflow: return z
  548. mov dig,0
  549. mov dx,word [z]
  550. mov cx,word [z+2]
  551. mov bx,word [z+4]
  552. mov ax,word [z+6]
  553. jmp @@r6
  554. @@n0: // D1. Normalize divisor:
  555. // n := n shl lzv, so that 2^63<=n<2^64
  556. // Note: n>=0x10000 leads to lzv<=47
  557. mov di,si
  558. test ax,ax
  559. jnz @@n2
  560. @@n1: add si,16
  561. or ax,bx
  562. mov bx,cx
  563. mov cx,dx
  564. mov dx,di
  565. jz @@n1
  566. @@n2: test ah,ah
  567. jnz @@n4
  568. add si,8
  569. or ah,al
  570. mov al,bh
  571. mov bh,bl
  572. mov bl,ch
  573. mov ch,cl
  574. mov cl,dh
  575. mov dh,dl
  576. mov dl,0
  577. js @@n5
  578. @@n3: inc si
  579. shl dx,1
  580. rcl cx,1
  581. rcl bx,1
  582. adc ax,ax
  583. @@n4: jns @@n3
  584. @@n5: mov word [n],dx
  585. mov word [n+2],cx
  586. mov word [n+4],bx
  587. mov word [n+6],ax
  588. mov lzv,si
  589. // Adjust divident accordingly:
  590. // u := uint128(z) shl lzv; lzv=si=0..47; di=0
  591. mov dx,word [z]
  592. mov cx,word [z+2]
  593. mov bx,word [z+4]
  594. mov ax,word [z+6]
  595. push bp
  596. mov bp,si // save lzv
  597. test si,8
  598. jz @@m0
  599. // << by odd-8
  600. xchg al,ah
  601. mov di,ax
  602. and di,0FFh
  603. mov al,bh
  604. mov bh,bl
  605. mov bl,ch
  606. mov ch,cl
  607. mov cl,dh
  608. mov dh,dl
  609. xor dl,dl
  610. @@m0: and si,7
  611. jz @@m2
  612. // << 1..7
  613. @@m1: shl dx,1
  614. rcl cx,1
  615. rcl bx,1
  616. rcl ax,1
  617. rcl di,1
  618. dec si
  619. jnz @@m1
  620. @@m2: // si=0, bp=lzv
  621. // di:ax:bx:cx:dx shifted by 0..15; 0|16|32 shifts remain
  622. sub bp,16
  623. jc @@m5
  624. sub bp,16
  625. jc @@m4
  626. // << 32
  627. pop bp
  628. mov word [u],si
  629. mov word [u+2],si
  630. mov word [u+4],dx
  631. mov word [u+6],cx
  632. mov word [u+8],bx
  633. mov word [u+10],ax
  634. mov word [u+12],di
  635. jmp @@m6
  636. @@m4: // << 16
  637. pop bp
  638. mov word [u],si
  639. mov word [u+2],dx
  640. mov word [u+4],cx
  641. mov word [u+6],bx
  642. mov word [u+8],ax
  643. mov word [u+10],di
  644. mov word [u+12],si
  645. jmp @@m6
  646. @@m5: // << 0
  647. pop bp
  648. mov word [u],dx
  649. mov word [u+2],cx
  650. mov word [u+4],bx
  651. mov word [u+6],ax
  652. mov word [u+8],di
  653. mov word [u+10],si
  654. mov word [u+12],si
  655. @@m6: // D2. Start from j:=2 (since u7=0 and u6<n3), si:=@u[j]
  656. lea si,word [u+4]
  657. @@d0: // D3. Estimate the next quotient digit:
  658. // q_hat := [u(j+4):u(j+3)]/[n3]
  659. // use max.possible q_hat if division overflows
  660. mov ax,-1
  661. mov dx,ss:[si+8]
  662. mov di,word [n+6]
  663. cmp dx,di
  664. jnc @@d1
  665. mov ax,ss:[si+6]
  666. div di
  667. @@d1: // D4. Multiply & subtract calculating partial reminder:
  668. // r := [u(j+4):u(j+3):u(j+2):u(j+1):u(j)]-q_hat*[n3:n2:n1:n0]
  669. push si // @u[j]
  670. mov si,ax // q_hat
  671. mul word [n]
  672. mov bx,ax
  673. mov cx,dx
  674. mov ax,word [n+2]
  675. mul si
  676. add cx,ax
  677. adc dx,0
  678. mov di,dx
  679. mov ax,word [n+4]
  680. mul si
  681. add di,ax
  682. adc dx,0
  683. xchg dx,si
  684. mov ax,word [n+6]
  685. mul dx
  686. add ax,si
  687. pop si // @u[j]
  688. adc dx,0
  689. sub ss:[si],bx
  690. sbb ss:[si+2],cx
  691. sbb ss:[si+4],di
  692. sbb ss:[si+6],ax
  693. mov di,ss:[si+8]
  694. sbb di,dx
  695. // D5. Test reminder
  696. jnc @@d3 // 0<=r<n
  697. // D6. Add back once or twice correcting the remainder:
  698. // while (r<0) do { r+=n; }
  699. mov dx,word [n]
  700. mov cx,word [n+2]
  701. mov bx,word [n+4]
  702. mov ax,word [n+6]
  703. @@d2: add ss:[si],dx
  704. adc ss:[si+2],cx
  705. adc ss:[si+4],bx
  706. adc ss:[si+6],ax
  707. adc di,0
  708. jnc @@d2
  709. @@d3: // D7. Loop on j--
  710. dec si
  711. dec si
  712. dec dig
  713. jnz @@d0
  714. // D8. "Unnormalize" and return reminder:
  715. // result := [u3:u2:u1:u0] shr lzv
  716. xor ax,ax
  717. mov si,lzv
  718. sub si,16
  719. jc @@r2
  720. sub si,16
  721. jc @@r1
  722. // >> 32..47
  723. mov bx,ax
  724. mov cx,word [u+6]
  725. mov dx,word [u+4]
  726. jmp @@r3
  727. @@r1: // >> 16..31
  728. mov bx,word [u+6]
  729. mov cx,word [u+4]
  730. mov dx,word [u+2]
  731. jmp @@r3
  732. @@r2: // >> 0..15
  733. mov ax,word [u+6]
  734. mov bx,word [u+4]
  735. mov cx,word [u+2]
  736. mov dx,word [u]
  737. @@r3: and si,15
  738. sub si,8
  739. jc @@r4
  740. // >> 8..15
  741. mov dl,dh
  742. mov dh,cl
  743. mov cl,ch
  744. mov ch,bl
  745. mov bl,bh
  746. mov bh,al
  747. mov al,ah
  748. xor ah,ah
  749. @@r4: and si,7
  750. jz @@r6
  751. // >> 1..7
  752. @@r5: shr ax,1
  753. rcr bx,1
  754. rcr cx,1
  755. rcr dx,1
  756. dec si
  757. jnz @@r5
  758. @@r6: mov word [result],dx
  759. mov word [result+2],cx
  760. mov word [result+4],bx
  761. mov word [result+6],ax
  762. @@q:
  763. end;
  764. if dig<>0 then
  765. HandleErrorAddrFrameInd(200,get_pc_addr,get_frame);
  766. end;