int64p.inc 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809
  1. {
  2. This file is part of the Free Pascal run time library.
  3. Copyright (c) 2013 by the Free Pascal development team
  4. This file contains some helper routines for int64 and qword
  5. See the file COPYING.FPC, included in this distribution,
  6. for details about the copyright.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  10. **********************************************************************}
  11. {$define FPC_SYSTEM_HAS_MUL_QWORD}
  12. function fpc_mul_qword( f1, f2: qword): qword; [public,alias: 'FPC_MUL_QWORD']; compilerproc;
  13. begin
  14. { routine contributed by Max Nazhalov
  15. 64-bit multiplication via 16-bit digits: (A3:A2:A1:A0)*(B3:B2:B1:B0)
  16. //////// STEP 1; break-down to 32-bit multiplications, each of them generates 64-bit result:
  17. (A3:A2*B3:B2)<<64 + (A3:A2*B1:B0)<<32 + (A1:A0*B3:B2)<<32 + (A1:A0*B1:B0)
  18. (A1:A0*B1:B0) = (A1*B1)<<32 + (A1*B0)<<16 + (A0*B1)<<16 + (A0:B0)
  19. -- never overflows, forms the base of the final result, name it as "R64"
  20. (A3:A2*B3:B2) is not required for the 64-bit result if overflow is not checked, since it is completely beyond the resulting width.
  21. -- always overflows if "<>0", so can be checked as "((A2|A3)<>0)&&(B2|B3)<>0)"
  22. (A3:A2*B1:B0) and (A1:A0*B3:B2) are partially required for the final result
  23. -- to be calculated on steps 2 and 3 as a correction for the "R64"
  24. //////// STEP 2; calculate "R64+=(A3:A2*B1:B0)<<32" (16-bit multiplications, each of them generates 32-bit result):
  25. (A3*B1)<<32 + (A3*B0)<<16 + (A2*B1)<<16 + (A2*B0)
  26. ((A3*B1)<<32)<<32 is not required for the 64-bit result if overflow is not checked, since it is completely beyond the resulting width.
  27. -- always overflows if "<>0", so can be checked as "(A3<>0)&&(B1<>0)"
  28. ((A3*B0)<<16)<<32: only low word of "A3*B0" contributes to the final result if overflow is not checked.
  29. -- overflows if the hi_word "<>0"
  30. -- overflows if R64+(lo_word<<48) produces C-flag
  31. ((A2*B1)<<16)<<32: only low word of "A2*B1" contributes to the final result if overflow is not checked.
  32. -- overflows if the hi_word "<>0"
  33. -- overflows if R64+(lo_word<<48) produces C-flag
  34. (A2*B0)<<32: the whole dword is significand, name it as "X"
  35. -- overflows if R64+(X<<32) produces C-flag
  36. //////// STEP 3; calculate "R64+=(A1:A0*B3:B2)<<32" (16-bit multiplications, each of them generates 32-bit result):
  37. (A1*B3)<<32 + (A1*B2)<<16 + (A0*B3)<<16 + (A0*B2)
  38. ((A1*B3)<<32)<<32 is not required for the 64-bit result if overflow is not checked, since it is completely beyond the resulting width.
  39. -- always overflows if "<>0", so can be checked as "(A1<>0)&&(B3<>0)"
  40. ((A1*B2)<<16)<<32: only low word of "A1*B2" contributes to the final result if overflow is not checked.
  41. -- overflows if the hi_word "<>0"
  42. -- overflows if R64+(lo_word<<48) produces C-flag
  43. ((A0*B3)<<16)<<32: only low word "A0*B3" contributes to the final result if overflow is not checked.
  44. -- overflows if the hi_word "<>0"
  45. -- overflows if R64+(lo_word<<48) produces C-flag
  46. (A0*B2)<<32: the whole dword is significand, name it as "Y"
  47. -- overflows if R64+(Y<<32) produces C-flag
  48. }
  49. asm
  50. mov di,word[f1]
  51. mov bx,word[f1+2]
  52. mov si,word[f2]
  53. mov ax,word[f2+2]
  54. push bp
  55. mov cx,ax
  56. mul bx
  57. xchg ax,bx
  58. mov bp,dx
  59. mul si
  60. xchg ax,cx
  61. add bx,dx
  62. adc bp,0
  63. mul di
  64. add cx,ax
  65. adc bx,dx
  66. adc bp,0
  67. mov ax,di
  68. mul si
  69. add cx,dx
  70. adc bx,0
  71. adc bp,0
  72. mov dx,bp
  73. pop bp
  74. mov word[result],ax
  75. mov word[result+2],cx
  76. mov word[result+4],bx
  77. mov word[result+6],dx
  78. mov si,word[f1+4]
  79. mov ax,word[f1+6]
  80. mov di,word[f2]
  81. mul di
  82. mov cx,ax
  83. mov ax,word[f2+2]
  84. mul si
  85. add cx,ax
  86. mov ax,di
  87. mul si
  88. mov bx,ax
  89. add cx,dx
  90. mov si,word[f2+4]
  91. mov ax,word[f2+6]
  92. mov di,word[f1]
  93. mul di
  94. add cx,ax
  95. mov ax,word[f1+2]
  96. mul si
  97. add cx,ax
  98. mov ax,di
  99. mul si
  100. add bx,ax
  101. adc cx,dx
  102. add word[result+4],bx
  103. adc word[result+6],cx
  104. end [ 'ax','bx','cx','dx','si','di' ];
  105. end;
  106. function fpc_mul_qword_checkoverflow( f1, f2: qword): qword; [public,alias: 'FPC_MUL_QWORD_CHECKOVERFLOW']; compilerproc;
  107. begin
  108. { routine contributed by Max Nazhalov
  109. 64-bit multiplication via 16-bit digits: (A3:A2:A1:A0)*(B3:B2:B1:B0)
  110. //////// STEP 1; break-down to 32-bit multiplications, each of them generates 64-bit result:
  111. (A3:A2*B3:B2)<<64 + (A3:A2*B1:B0)<<32 + (A1:A0*B3:B2)<<32 + (A1:A0*B1:B0)
  112. (A1:A0*B1:B0) = (A1*B1)<<32 + (A1*B0)<<16 + (A0*B1)<<16 + (A0:B0)
  113. -- never overflows, forms the base of the final result, name it as "R64"
  114. (A3:A2*B3:B2) is not required for the 64-bit result if overflow is not checked, since it is completely beyond the resulting width.
  115. -- always overflows if "<>0", so can be checked as "((A2|A3)<>0)&&(B2|B3)<>0)"
  116. (A3:A2*B1:B0) and (A1:A0*B3:B2) are partially required for the final result
  117. -- to be calculated on steps 2 and 3 as a correction for the "R64"
  118. //////// STEP 2; calculate "R64+=(A3:A2*B1:B0)<<32" (16-bit multiplications, each of them generates 32-bit result):
  119. (A3*B1)<<32 + (A3*B0)<<16 + (A2*B1)<<16 + (A2*B0)
  120. ((A3*B1)<<32)<<32 is not required for the 64-bit result if overflow is not checked, since it is completely beyond the resulting width.
  121. -- always overflows if "<>0", so can be checked as "(A3<>0)&&(B1<>0)"
  122. ((A3*B0)<<16)<<32: only low word of "A3*B0" contributes to the final result if overflow is not checked.
  123. -- overflows if the hi_word "<>0"
  124. -- overflows if R64+(lo_word<<48) produces C-flag
  125. ((A2*B1)<<16)<<32: only low word of "A2*B1" contributes to the final result if overflow is not checked.
  126. -- overflows if the hi_word "<>0"
  127. -- overflows if R64+(lo_word<<48) produces C-flag
  128. (A2*B0)<<32: the whole dword is significand, name it as "X"
  129. -- overflows if R64+(X<<32) produces C-flag
  130. //////// STEP 3; calculate "R64+=(A1:A0*B3:B2)<<32" (16-bit multiplications, each of them generates 32-bit result):
  131. (A1*B3)<<32 + (A1*B2)<<16 + (A0*B3)<<16 + (A0*B2)
  132. ((A1*B3)<<32)<<32 is not required for the 64-bit result if overflow is not checked, since it is completely beyond the resulting width.
  133. -- always overflows if "<>0", so can be checked as "(A1<>0)&&(B3<>0)"
  134. ((A1*B2)<<16)<<32: only low word of "A1*B2" contributes to the final result if overflow is not checked.
  135. -- overflows if the hi_word "<>0"
  136. -- overflows if R64+(lo_word<<48) produces C-flag
  137. ((A0*B3)<<16)<<32: only low word "A0*B3" contributes to the final result if overflow is not checked.
  138. -- overflows if the hi_word "<>0"
  139. -- overflows if R64+(lo_word<<48) produces C-flag
  140. (A0*B2)<<32: the whole dword is significand, name it as "Y"
  141. -- overflows if R64+(Y<<32) produces C-flag
  142. }
  143. asm
  144. mov di,word[f1]
  145. mov bx,word[f1+2]
  146. mov si,word[f2]
  147. mov ax,word[f2+2]
  148. push bp
  149. mov cx,ax
  150. mul bx
  151. xchg ax,bx
  152. mov bp,dx
  153. mul si
  154. xchg ax,cx
  155. add bx,dx
  156. adc bp,0
  157. mul di
  158. add cx,ax
  159. adc bx,dx
  160. adc bp,0
  161. mov ax,di
  162. mul si
  163. add cx,dx
  164. adc bx,0
  165. adc bp,0
  166. mov dx,bp
  167. pop bp
  168. mov word[result],ax
  169. mov word[result+2],cx
  170. mov word[result+4],bx
  171. mov word[result+6],dx
  172. mov si,word[f1+4]
  173. mov ax,word[f1+6]
  174. mov bx,word[f2+6]
  175. mov cx,ax
  176. or cx,si
  177. jz @@nover1
  178. mov cx,word[f2+4]
  179. or cx,bx
  180. jnz @@overflow
  181. @@nover1:
  182. test bx,bx
  183. jz @@nover2
  184. mov bx,word[f1+2]
  185. test bx,bx
  186. jnz @@overflow
  187. @@nover2:
  188. test ax,ax
  189. jz @@nover3
  190. or bx,word[f2+2]
  191. jnz @@overflow
  192. @@nover3:
  193. mov di,word[f2]
  194. mul di
  195. test dx,dx
  196. jnz @@overflow
  197. mov cx,ax
  198. mov ax,word[f2+2]
  199. mul si
  200. test dx,dx
  201. jnz @@overflow
  202. add cx,ax
  203. jc @@overflow
  204. mov ax,di
  205. mul si
  206. mov bx,ax
  207. add cx,dx
  208. jc @@overflow
  209. mov si,word[f2+4]
  210. mov ax,word[f2+6]
  211. mov di,word[f1]
  212. mul di
  213. test dx,dx
  214. jnz @@overflow
  215. add cx,ax
  216. jc @@overflow
  217. mov ax,word[f1+2]
  218. mul si
  219. test dx,dx
  220. jnz @@overflow
  221. add cx,ax
  222. jc @@overflow
  223. mov ax,di
  224. mul si
  225. add bx,ax
  226. adc cx,dx
  227. jc @@overflow
  228. add word[result+4],bx
  229. adc word[result+6],cx
  230. jnc @@done
  231. @@overflow:
  232. call FPC_OVERFLOW
  233. @@done:
  234. end [ 'ax','bx','cx','dx','si','di' ];
  235. end;
  236. {$define FPC_SYSTEM_HAS_MUL_DWORD_TO_QWORD}
  237. function fpc_mul_dword_to_qword(f1,f2 : dword) : qword;[public,alias: 'FPC_MUL_DWORD_TO_QWORD']; compilerproc; assembler; nostackframe;
  238. asm
  239. push bp
  240. mov bp, sp
  241. mov di,word[bp + 8 + extra_param_offset] // word[f1]
  242. mov bx,word[bp + 10 + extra_param_offset] // word[f1+2]
  243. mov si,word[bp + 4 + extra_param_offset] // word[f2]
  244. mov ax,word[bp + 6 + extra_param_offset] // word[f2+2]
  245. mov cx,ax
  246. mul bx
  247. xchg ax,bx
  248. mov bp,dx
  249. mul si
  250. xchg ax,cx
  251. add bx,dx
  252. adc bp,0
  253. mul di
  254. add cx,ax
  255. adc bx,dx
  256. adc bp,0
  257. mov ax,di
  258. mul si
  259. add cx,dx
  260. adc bx,0
  261. adc bp,0
  262. mov dx,ax
  263. mov ax,bp
  264. pop bp
  265. end;
  266. {$define FPC_SYSTEM_HAS_DIV_QWORD}
  267. function fpc_div_qword( n, z: qword ): qword; [public, alias:'FPC_DIV_QWORD']; compilerproc;
  268. // Generic "schoolbook" division algorithm
  269. // see [D.Knuth, TAOCP, vol.2, sect.4.3.1] for explanation
  270. var
  271. dig: byte;
  272. u: array [0..6] of word;
  273. begin
  274. asm
  275. mov dig,3 // quotient contains 3 digits for "long" division path
  276. // Check parameters
  277. mov dx,word [n]
  278. mov cx,word [n+2]
  279. mov bx,word [n+4]
  280. mov ax,word [n+6]
  281. mov di,ax
  282. or di,bx
  283. or di,cx
  284. jnz @@s1
  285. or di,dx
  286. jz @@q // div by 0
  287. // Short division
  288. mov dig,al
  289. mov dx,word [z+6]
  290. cmp dx,di
  291. jc @@s0
  292. xchg ax,dx
  293. div di
  294. @@s0: mov word [result+6],ax
  295. mov ax,word [z+4]
  296. div di
  297. mov word [result+4],ax
  298. mov ax,word [z+2]
  299. div di
  300. mov word [result+2],ax
  301. mov ax,word [z]
  302. div di
  303. mov word [result],ax
  304. jmp @@q
  305. @@s1: // Long division
  306. xor si,si
  307. cmp word [z],dx
  308. mov di,word [z+2]
  309. sbb di,cx
  310. mov di,word [z+4]
  311. sbb di,bx
  312. mov di,word [z+6]
  313. sbb di,ax
  314. jnc @@n0
  315. // underflow: return 0
  316. mov dig,0
  317. mov word [result],si
  318. mov word [result+2],si
  319. mov word [result+4],si
  320. mov word [result+6],si
  321. jmp @@q
  322. @@n0: // D1. Normalize divisor:
  323. // n := n shl lzv, so that 2^63<=n<2^64
  324. // Note: n>=0x10000 leads to lzv<=47 and q3=0
  325. mov word [result+6],si // q3:=0
  326. mov di,si
  327. test ax,ax
  328. jnz @@n2
  329. @@n1: add si,16
  330. or ax,bx
  331. mov bx,cx
  332. mov cx,dx
  333. mov dx,di
  334. jz @@n1
  335. @@n2: test ah,ah
  336. jnz @@n4
  337. add si,8
  338. or ah,al
  339. mov al,bh
  340. mov bh,bl
  341. mov bl,ch
  342. mov ch,cl
  343. mov cl,dh
  344. mov dh,dl
  345. mov dl,0
  346. js @@n5
  347. @@n3: inc si
  348. shl dx,1
  349. rcl cx,1
  350. rcl bx,1
  351. adc ax,ax
  352. @@n4: jns @@n3
  353. @@n5: mov word [n],dx
  354. mov word [n+2],cx
  355. mov word [n+4],bx
  356. mov word [n+6],ax
  357. // Adjust divident accordingly:
  358. // u := uint128(z) shl lzv; lzv=si=0..47; di=0
  359. mov dx,word [z]
  360. mov cx,word [z+2]
  361. mov bx,word [z+4]
  362. mov ax,word [z+6]
  363. push bp
  364. mov bp,si // save lzv
  365. test si,8
  366. jz @@m0
  367. // << by odd-8
  368. xchg al,ah
  369. mov di,ax
  370. and di,0FFh
  371. mov al,bh
  372. mov bh,bl
  373. mov bl,ch
  374. mov ch,cl
  375. mov cl,dh
  376. mov dh,dl
  377. xor dl,dl
  378. @@m0: and si,7
  379. jz @@m2
  380. // << 1..7
  381. @@m1: shl dx,1
  382. rcl cx,1
  383. rcl bx,1
  384. rcl ax,1
  385. rcl di,1
  386. dec si
  387. jnz @@m1
  388. @@m2: // si=0, bp=lzv
  389. // di:ax:bx:cx:dx shifted by 0..15; 0|16|32 shifts remain
  390. sub bp,16
  391. jc @@m5
  392. sub bp,16
  393. jc @@m4
  394. // << 32
  395. pop bp
  396. mov word [u],si
  397. mov word [u+2],si
  398. mov word [u+4],dx
  399. mov word [u+6],cx
  400. mov word [u+8],bx
  401. mov word [u+10],ax
  402. mov word [u+12],di
  403. jmp @@m6
  404. @@m4: // << 16
  405. pop bp
  406. mov word [u],si
  407. mov word [u+2],dx
  408. mov word [u+4],cx
  409. mov word [u+6],bx
  410. mov word [u+8],ax
  411. mov word [u+10],di
  412. mov word [u+12],si
  413. jmp @@m6
  414. @@m5: // << 0
  415. pop bp
  416. mov word [u],dx
  417. mov word [u+2],cx
  418. mov word [u+4],bx
  419. mov word [u+6],ax
  420. mov word [u+8],di
  421. mov word [u+10],si
  422. mov word [u+12],si
  423. @@m6: // D2. Start from j:=2 (since u7=0 and u6<n3), si:=@u[j], bx:=@q[j]
  424. lea si,word [u+4]
  425. lea bx,word [result+4]
  426. @@d0: push bx
  427. // D3. Estimate the next quotient digit:
  428. // q_hat := [u(j+4):u(j+3)]/[n3]
  429. // use max.possible q_hat if division overflows
  430. mov ax,-1
  431. mov dx,ss:[si+8]
  432. mov di,word [n+6]
  433. cmp dx,di
  434. jnc @@d1
  435. mov ax,ss:[si+6]
  436. div di
  437. @@d1: // D4. Multiply & subtract calculating partial reminder:
  438. // r := [u(j+4):u(j+3):u(j+2):u(j+1):u(j)]-q_hat*[n3:n2:n1:n0]
  439. push ax // q_hat
  440. push si // @u[j]
  441. mov si,ax
  442. mul word [n]
  443. mov bx,ax
  444. mov cx,dx
  445. mov ax,word [n+2]
  446. mul si
  447. add cx,ax
  448. adc dx,0
  449. mov di,dx
  450. mov ax,word [n+4]
  451. mul si
  452. add di,ax
  453. adc dx,0
  454. xchg dx,si
  455. mov ax,word [n+6]
  456. mul dx
  457. add ax,si
  458. pop si // @u[j]
  459. adc dx,0
  460. sub ss:[si],bx
  461. sbb ss:[si+2],cx
  462. sbb ss:[si+4],di
  463. sbb ss:[si+6],ax
  464. sbb ss:[si+8],dx
  465. pop di // q_hat
  466. // D5. Test reminder
  467. jnc @@d3 // 0<=r<n
  468. // D6. Add back once or twice correcting the quotient and remainder:
  469. // while (r<0) do { q_hat--; r+=n; }
  470. mov dx,word [n]
  471. mov cx,word [n+2]
  472. mov bx,word [n+4]
  473. mov ax,word [n+6]
  474. @@d2: dec di
  475. add ss:[si],dx
  476. adc ss:[si+2],cx
  477. adc ss:[si+4],bx
  478. adc ss:[si+6],ax
  479. adc word ss:[si+8],0
  480. jnc @@d2
  481. @@d3: // D7. Store q[j], loop on j--
  482. pop bx // @q[j]
  483. dec si
  484. dec si
  485. mov ss:[bx],di
  486. dec bx
  487. dec bx
  488. dec dig
  489. jnz @@d0
  490. @@q:
  491. end;
  492. if dig<>0 then
  493. HandleErrorAddrFrameInd(200,get_pc_addr,get_frame);
  494. end;
  495. {$define FPC_SYSTEM_HAS_MOD_QWORD}
  496. function fpc_mod_qword( n, z: qword ): qword; [public, alias:'FPC_MOD_QWORD']; compilerproc;
  497. // Generic "schoolbook" division algorithm
  498. // see [D.Knuth, TAOCP, vol.2, sect.4.3.1] for explanation
  499. var
  500. dig: byte;
  501. lzv: word;
  502. u: array [0..6] of word;
  503. begin
  504. asm
  505. mov dig,3 // quotient contains 3 digist for "long" division path
  506. // Check parameters
  507. mov dx,word [n]
  508. mov cx,word [n+2]
  509. mov bx,word [n+4]
  510. mov ax,word [n+6]
  511. mov di,ax
  512. or di,bx
  513. or di,cx
  514. jnz @@s1
  515. or di,dx
  516. jz @@q // div by 0
  517. // Short division
  518. mov dig,al
  519. mov dx,word [z+6]
  520. cmp dx,di
  521. jc @@s0
  522. xchg ax,dx
  523. div di
  524. @@s0: mov ax,word [z+4]
  525. div di
  526. mov ax,word [z+2]
  527. div di
  528. mov ax,word [z]
  529. div di
  530. mov word [result],dx
  531. mov word [result+2],cx
  532. mov word [result+4],cx
  533. mov word [result+6],cx
  534. jmp @@q
  535. @@s1: // Long division
  536. xor si,si
  537. cmp word [z],dx
  538. mov di,word [z+2]
  539. sbb di,cx
  540. mov di,word [z+4]
  541. sbb di,bx
  542. mov di,word [z+6]
  543. sbb di,ax
  544. jnc @@n0
  545. // underflow: return z
  546. mov dig,0
  547. mov dx,word [z]
  548. mov cx,word [z+2]
  549. mov bx,word [z+4]
  550. mov ax,word [z+6]
  551. jmp @@r6
  552. @@n0: // D1. Normalize divisor:
  553. // n := n shl lzv, so that 2^63<=n<2^64
  554. // Note: n>=0x10000 leads to lzv<=47
  555. mov di,si
  556. test ax,ax
  557. jnz @@n2
  558. @@n1: add si,16
  559. or ax,bx
  560. mov bx,cx
  561. mov cx,dx
  562. mov dx,di
  563. jz @@n1
  564. @@n2: test ah,ah
  565. jnz @@n4
  566. add si,8
  567. or ah,al
  568. mov al,bh
  569. mov bh,bl
  570. mov bl,ch
  571. mov ch,cl
  572. mov cl,dh
  573. mov dh,dl
  574. mov dl,0
  575. js @@n5
  576. @@n3: inc si
  577. shl dx,1
  578. rcl cx,1
  579. rcl bx,1
  580. adc ax,ax
  581. @@n4: jns @@n3
  582. @@n5: mov word [n],dx
  583. mov word [n+2],cx
  584. mov word [n+4],bx
  585. mov word [n+6],ax
  586. mov lzv,si
  587. // Adjust divident accordingly:
  588. // u := uint128(z) shl lzv; lzv=si=0..47; di=0
  589. mov dx,word [z]
  590. mov cx,word [z+2]
  591. mov bx,word [z+4]
  592. mov ax,word [z+6]
  593. push bp
  594. mov bp,si // save lzv
  595. test si,8
  596. jz @@m0
  597. // << by odd-8
  598. xchg al,ah
  599. mov di,ax
  600. and di,0FFh
  601. mov al,bh
  602. mov bh,bl
  603. mov bl,ch
  604. mov ch,cl
  605. mov cl,dh
  606. mov dh,dl
  607. xor dl,dl
  608. @@m0: and si,7
  609. jz @@m2
  610. // << 1..7
  611. @@m1: shl dx,1
  612. rcl cx,1
  613. rcl bx,1
  614. rcl ax,1
  615. rcl di,1
  616. dec si
  617. jnz @@m1
  618. @@m2: // si=0, bp=lzv
  619. // di:ax:bx:cx:dx shifted by 0..15; 0|16|32 shifts remain
  620. sub bp,16
  621. jc @@m5
  622. sub bp,16
  623. jc @@m4
  624. // << 32
  625. pop bp
  626. mov word [u],si
  627. mov word [u+2],si
  628. mov word [u+4],dx
  629. mov word [u+6],cx
  630. mov word [u+8],bx
  631. mov word [u+10],ax
  632. mov word [u+12],di
  633. jmp @@m6
  634. @@m4: // << 16
  635. pop bp
  636. mov word [u],si
  637. mov word [u+2],dx
  638. mov word [u+4],cx
  639. mov word [u+6],bx
  640. mov word [u+8],ax
  641. mov word [u+10],di
  642. mov word [u+12],si
  643. jmp @@m6
  644. @@m5: // << 0
  645. pop bp
  646. mov word [u],dx
  647. mov word [u+2],cx
  648. mov word [u+4],bx
  649. mov word [u+6],ax
  650. mov word [u+8],di
  651. mov word [u+10],si
  652. mov word [u+12],si
  653. @@m6: // D2. Start from j:=2 (since u7=0 and u6<n3), si:=@u[j]
  654. lea si,word [u+4]
  655. @@d0: // D3. Estimate the next quotient digit:
  656. // q_hat := [u(j+4):u(j+3)]/[n3]
  657. // use max.possible q_hat if division overflows
  658. mov ax,-1
  659. mov dx,ss:[si+8]
  660. mov di,word [n+6]
  661. cmp dx,di
  662. jnc @@d1
  663. mov ax,ss:[si+6]
  664. div di
  665. @@d1: // D4. Multiply & subtract calculating partial reminder:
  666. // r := [u(j+4):u(j+3):u(j+2):u(j+1):u(j)]-q_hat*[n3:n2:n1:n0]
  667. push si // @u[j]
  668. mov si,ax // q_hat
  669. mul word [n]
  670. mov bx,ax
  671. mov cx,dx
  672. mov ax,word [n+2]
  673. mul si
  674. add cx,ax
  675. adc dx,0
  676. mov di,dx
  677. mov ax,word [n+4]
  678. mul si
  679. add di,ax
  680. adc dx,0
  681. xchg dx,si
  682. mov ax,word [n+6]
  683. mul dx
  684. add ax,si
  685. pop si // @u[j]
  686. adc dx,0
  687. sub ss:[si],bx
  688. sbb ss:[si+2],cx
  689. sbb ss:[si+4],di
  690. sbb ss:[si+6],ax
  691. mov di,ss:[si+8]
  692. sbb di,dx
  693. // D5. Test reminder
  694. jnc @@d3 // 0<=r<n
  695. // D6. Add back once or twice correcting the remainder:
  696. // while (r<0) do { r+=n; }
  697. mov dx,word [n]
  698. mov cx,word [n+2]
  699. mov bx,word [n+4]
  700. mov ax,word [n+6]
  701. @@d2: add ss:[si],dx
  702. adc ss:[si+2],cx
  703. adc ss:[si+4],bx
  704. adc ss:[si+6],ax
  705. adc di,0
  706. jnc @@d2
  707. @@d3: // D7. Loop on j--
  708. dec si
  709. dec si
  710. dec dig
  711. jnz @@d0
  712. // D8. "Unnormalize" and return reminder:
  713. // result := [u3:u2:u1:u0] shr lzv
  714. xor ax,ax
  715. mov si,lzv
  716. sub si,16
  717. jc @@r2
  718. sub si,16
  719. jc @@r1
  720. // >> 32..47
  721. mov bx,ax
  722. mov cx,word [u+6]
  723. mov dx,word [u+4]
  724. jmp @@r3
  725. @@r1: // >> 16..31
  726. mov bx,word [u+6]
  727. mov cx,word [u+4]
  728. mov dx,word [u+2]
  729. jmp @@r3
  730. @@r2: // >> 0..15
  731. mov ax,word [u+6]
  732. mov bx,word [u+4]
  733. mov cx,word [u+2]
  734. mov dx,word [u]
  735. @@r3: and si,15
  736. sub si,8
  737. jc @@r4
  738. // >> 8..15
  739. mov dl,dh
  740. mov dh,cl
  741. mov cl,ch
  742. mov ch,bl
  743. mov bl,bh
  744. mov bh,al
  745. mov al,ah
  746. xor ah,ah
  747. @@r4: and si,7
  748. jz @@r6
  749. // >> 1..7
  750. @@r5: shr ax,1
  751. rcr bx,1
  752. rcr cx,1
  753. rcr dx,1
  754. dec si
  755. jnz @@r5
  756. @@r6: mov word [result],dx
  757. mov word [result+2],cx
  758. mov word [result+4],bx
  759. mov word [result+6],ax
  760. @@q:
  761. end;
  762. if dig<>0 then
  763. HandleErrorAddrFrameInd(200,get_pc_addr,get_frame);
  764. end;