ttmathuint_x86.h 29 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602
  1. /*
  2. * This file is a part of TTMath Bignum Library
  3. * and is distributed under the (new) BSD licence.
  4. * Author: Tomasz Sowa <[email protected]>
  5. */
  6. /*
  7. * Copyright (c) 2006-2009, Tomasz Sowa
  8. * All rights reserved.
  9. *
  10. * Redistribution and use in source and binary forms, with or without
  11. * modification, are permitted provided that the following conditions are met:
  12. *
  13. * * Redistributions of source code must retain the above copyright notice,
  14. * this list of conditions and the following disclaimer.
  15. *
  16. * * Redistributions in binary form must reproduce the above copyright
  17. * notice, this list of conditions and the following disclaimer in the
  18. * documentation and/or other materials provided with the distribution.
  19. *
  20. * * Neither the name Tomasz Sowa nor the names of contributors to this
  21. * project may be used to endorse or promote products derived
  22. * from this software without specific prior written permission.
  23. *
  24. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  25. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  26. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  27. * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  28. * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  29. * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  30. * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  31. * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  32. * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  33. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  34. * THE POSSIBILITY OF SUCH DAMAGE.
  35. */
  36. #ifndef headerfilettmathuint_x86
  37. #define headerfilettmathuint_x86
  38. #ifndef TTMATH_NOASM
  39. #ifdef TTMATH_PLATFORM32
  40. /*!
  41. \file ttmathuint_x86.h
  42. \brief template class UInt<uint> with assembler code for 32bit x86 processors
  43. this file is included at the end of ttmathuint.h
  44. */
  45. /*!
  46. \brief a namespace for the TTMath library
  47. */
  48. namespace ttmath
  49. {
  50. /*!
  51. returning the string represents the currect type of the library
  52. we have following types:
  53. asm_vc_32 - with asm code designed for Microsoft Visual C++ (32 bits)
  54. asm_gcc_32 - with asm code designed for GCC (32 bits)
  55. asm_vc_64 - with asm for VC (64 bit)
  56. asm_gcc_64 - with asm for GCC (64 bit)
  57. no_asm_32 - pure C++ version (32 bit) - without any asm code
  58. no_asm_64 - pure C++ version (64 bit) - without any asm code
  59. */
  60. template<uint value_size>
  61. const char * UInt<value_size>::LibTypeStr()
  62. {
  63. #ifndef __GNUC__
  64. static const char info[] = "asm_vc_32";
  65. #endif
  66. #ifdef __GNUC__
  67. static const char info[] = "asm_gcc_32";
  68. #endif
  69. return info;
  70. }
  71. /*!
  72. returning the currect type of the library
  73. */
  74. template<uint value_size>
  75. LibTypeCode UInt<value_size>::LibType()
  76. {
  77. #ifndef __GNUC__
  78. LibTypeCode info = asm_vc_32;
  79. #endif
  80. #ifdef __GNUC__
  81. LibTypeCode info = asm_gcc_32;
  82. #endif
  83. return info;
  84. }
  85. /*!
  86. *
  87. * basic mathematic functions
  88. *
  89. */
  90. /*!
  91. adding ss2 to the this and adding carry if it's defined
  92. (this = this + ss2 + c)
  93. c must be zero or one (might be a bigger value than 1)
  94. function returns carry (1) (if it has been)
  95. */
  96. template<uint value_size>
  97. uint UInt<value_size>::Add(const UInt<value_size> & ss2, uint c)
  98. {
  99. uint b = value_size;
  100. uint * p1 = table;
  101. uint * p2 = const_cast<uint*>(ss2.table);
  102. // we don't have to use TTMATH_REFERENCE_ASSERT here
  103. // this algorithm doesn't require it
  104. #ifndef __GNUC__
  105. // this part might be compiled with for example visual c
  106. __asm
  107. {
  108. push eax
  109. push ebx
  110. push ecx
  111. push edx
  112. push esi
  113. mov ecx,[b]
  114. mov ebx,[p1]
  115. mov esi,[p2]
  116. xor edx,edx // edx=0
  117. mov eax,[c]
  118. neg eax // CF=1 if rax!=0 , CF=0 if rax==0
  119. ttmath_loop:
  120. mov eax,[esi+edx*4]
  121. adc [ebx+edx*4],eax
  122. inc edx
  123. dec ecx
  124. jnz ttmath_loop
  125. adc ecx, ecx
  126. mov [c], ecx
  127. pop esi
  128. pop edx
  129. pop ecx
  130. pop ebx
  131. pop eax
  132. }
  133. #endif
  134. #ifdef __GNUC__
  135. uint dummy, dummy2;
  136. // this part should be compiled with gcc
  137. __asm__ __volatile__(
  138. "xorl %%edx, %%edx \n"
  139. "negl %%eax \n" // CF=1 if rax!=0 , CF=0 if rax==0
  140. "1: \n"
  141. "movl (%%esi,%%edx,4), %%eax \n"
  142. "adcl %%eax, (%%ebx,%%edx,4) \n"
  143. "incl %%edx \n"
  144. "decl %%ecx \n"
  145. "jnz 1b \n"
  146. "adc %%ecx, %%ecx \n"
  147. : "=c" (c), "=a" (dummy), "=d" (dummy2)
  148. : "0" (b), "1" (c), "b" (p1), "S" (p2)
  149. : "cc", "memory" );
  150. #endif
  151. TTMATH_LOGC("UInt::Add", c)
  152. return c;
  153. }
  154. /*!
  155. adding one word (at a specific position)
  156. and returning a carry (if it has been)
  157. e.g.
  158. if we've got (value_size=3):
  159. table[0] = 10;
  160. table[1] = 30;
  161. table[2] = 5;
  162. and we call:
  163. AddInt(2,1)
  164. then it'll be:
  165. table[0] = 10;
  166. table[1] = 30 + 2;
  167. table[2] = 5;
  168. of course if there was a carry from table[2] it would be returned
  169. */
  170. template<uint value_size>
  171. uint UInt<value_size>::AddInt(uint value, uint index)
  172. {
  173. uint b = value_size;
  174. uint * p1 = table;
  175. uint c;
  176. TTMATH_ASSERT( index < value_size )
  177. #ifndef __GNUC__
  178. __asm
  179. {
  180. push eax
  181. push ebx
  182. push ecx
  183. push edx
  184. mov ecx, [b]
  185. sub ecx, [index]
  186. mov edx, [index]
  187. mov ebx, [p1]
  188. mov eax, [value]
  189. ttmath_loop:
  190. add [ebx+edx*4], eax
  191. jnc ttmath_end
  192. mov eax, 1
  193. inc edx
  194. dec ecx
  195. jnz ttmath_loop
  196. ttmath_end:
  197. setc al
  198. movzx edx, al
  199. mov [c], edx
  200. pop edx
  201. pop ecx
  202. pop ebx
  203. pop eax
  204. }
  205. #endif
  206. #ifdef __GNUC__
  207. uint dummy, dummy2;
  208. __asm__ __volatile__(
  209. "subl %%edx, %%ecx \n"
  210. "1: \n"
  211. "addl %%eax, (%%ebx,%%edx,4) \n"
  212. "jnc 2f \n"
  213. "movl $1, %%eax \n"
  214. "incl %%edx \n"
  215. "decl %%ecx \n"
  216. "jnz 1b \n"
  217. "2: \n"
  218. "setc %%al \n"
  219. "movzx %%al, %%edx \n"
  220. : "=d" (c), "=a" (dummy), "=c" (dummy2)
  221. : "0" (index), "1" (value), "2" (b), "b" (p1)
  222. : "cc", "memory" );
  223. #endif
  224. TTMATH_LOGC("UInt::AddInt", c)
  225. return c;
  226. }
  227. /*!
  228. adding only two unsigned words to the existing value
  229. and these words begin on the 'index' position
  230. (it's used in the multiplication algorithm 2)
  231. index should be equal or smaller than value_size-2 (index <= value_size-2)
  232. x1 - lower word, x2 - higher word
  233. for example if we've got value_size equal 4 and:
  234. table[0] = 3
  235. table[1] = 4
  236. table[2] = 5
  237. table[3] = 6
  238. then let
  239. x1 = 10
  240. x2 = 20
  241. and
  242. index = 1
  243. the result of this method will be:
  244. table[0] = 3
  245. table[1] = 4 + x1 = 14
  246. table[2] = 5 + x2 = 25
  247. table[3] = 6
  248. and no carry at the end of table[3]
  249. (of course if there was a carry in table[2](5+20) then
  250. this carry would be passed to the table[3] etc.)
  251. */
  252. template<uint value_size>
  253. uint UInt<value_size>::AddTwoInts(uint x2, uint x1, uint index)
  254. {
  255. uint b = value_size;
  256. uint * p1 = table;
  257. uint c;
  258. TTMATH_ASSERT( index < value_size - 1 )
  259. #ifndef __GNUC__
  260. __asm
  261. {
  262. push eax
  263. push ebx
  264. push ecx
  265. push edx
  266. mov ecx, [b]
  267. sub ecx, [index]
  268. mov ebx, [p1]
  269. mov edx, [index]
  270. mov eax, [x1]
  271. add [ebx+edx*4], eax
  272. inc edx
  273. dec ecx
  274. mov eax, [x2]
  275. ttmath_loop:
  276. adc [ebx+edx*4], eax
  277. jnc ttmath_end
  278. mov eax, 0
  279. inc edx
  280. dec ecx
  281. jnz ttmath_loop
  282. ttmath_end:
  283. setc al
  284. movzx edx, al
  285. mov [c], edx
  286. pop edx
  287. pop ecx
  288. pop ebx
  289. pop eax
  290. }
  291. #endif
  292. #ifdef __GNUC__
  293. uint dummy, dummy2;
  294. __asm__ __volatile__(
  295. "subl %%edx, %%ecx \n"
  296. "addl %%esi, (%%ebx,%%edx,4) \n"
  297. "incl %%edx \n"
  298. "decl %%ecx \n"
  299. "1: \n"
  300. "adcl %%eax, (%%ebx,%%edx,4) \n"
  301. "jnc 2f \n"
  302. "mov $0, %%eax \n"
  303. "incl %%edx \n"
  304. "decl %%ecx \n"
  305. "jnz 1b \n"
  306. "2: \n"
  307. "setc %%al \n"
  308. "movzx %%al, %%eax \n"
  309. : "=a" (c), "=c" (dummy), "=d" (dummy2)
  310. : "0" (x2), "1" (b), "2" (index), "b" (p1), "S" (x1)
  311. : "cc", "memory" );
  312. #endif
  313. TTMATH_LOGC("UInt::AddTwoInts", c)
  314. return c;
  315. }
  316. /*!
  317. this static method addes one vector to the other
  318. 'ss1' is larger in size or equal to 'ss2'
  319. ss1 points to the first (larger) vector
  320. ss2 points to the second vector
  321. ss1_size - size of the ss1 (and size of the result too)
  322. ss2_size - size of the ss2
  323. result - is the result vector (which has size the same as ss1: ss1_size)
  324. Example: ss1_size is 5, ss2_size is 3
  325. ss1: ss2: result (output):
  326. 5 1 5+1
  327. 4 3 4+3
  328. 2 7 2+7
  329. 6 6
  330. 9 9
  331. of course the carry is propagated and will be returned from the last item
  332. (this method is used by the Karatsuba multiplication algorithm)
  333. */
  334. template<uint value_size>
  335. uint UInt<value_size>::AddVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
  336. {
  337. TTMATH_ASSERT( ss1_size >= ss2_size )
  338. uint rest = ss1_size - ss2_size;
  339. uint c;
  340. #ifndef __GNUC__
  341. // this part might be compiled with for example visual c
  342. __asm
  343. {
  344. pushad
  345. mov ecx, [ss2_size]
  346. xor edx, edx // edx = 0, cf = 0
  347. mov esi, [ss1]
  348. mov ebx, [ss2]
  349. mov edi, [result]
  350. ttmath_loop:
  351. mov eax, [esi+edx*4]
  352. adc eax, [ebx+edx*4]
  353. mov [edi+edx*4], eax
  354. inc edx
  355. dec ecx
  356. jnz ttmath_loop
  357. adc ecx, ecx // ecx has the cf state
  358. mov ebx, [rest]
  359. or ebx, ebx
  360. jz ttmath_end
  361. xor ebx, ebx // ebx = 0
  362. neg ecx // setting cf from ecx
  363. mov ecx, [rest] // ecx is != 0
  364. ttmath_loop2:
  365. mov eax, [esi+edx*4]
  366. adc eax, ebx
  367. mov [edi+edx*4], eax
  368. inc edx
  369. dec ecx
  370. jnz ttmath_loop2
  371. adc ecx, ecx
  372. ttmath_end:
  373. mov [c], ecx
  374. popad
  375. }
  376. #endif
  377. #ifdef __GNUC__
  378. // this part should be compiled with gcc
  379. uint dummy1, dummy2, dummy3;
  380. __asm__ __volatile__(
  381. "push %%edx \n"
  382. "xor %%edx, %%edx \n" // edx = 0, cf = 0
  383. "1: \n"
  384. "mov (%%esi,%%edx,4), %%eax \n"
  385. "adc (%%ebx,%%edx,4), %%eax \n"
  386. "mov %%eax, (%%edi,%%edx,4) \n"
  387. "inc %%edx \n"
  388. "dec %%ecx \n"
  389. "jnz 1b \n"
  390. "adc %%ecx, %%ecx \n" // ecx has the cf state
  391. "pop %%eax \n" // eax = rest
  392. "or %%eax, %%eax \n"
  393. "jz 3f \n"
  394. "xor %%ebx, %%ebx \n" // ebx = 0
  395. "neg %%ecx \n" // setting cf from ecx
  396. "mov %%eax, %%ecx \n" // ecx=rest and is != 0
  397. "2: \n"
  398. "mov (%%esi, %%edx, 4), %%eax \n"
  399. "adc %%ebx, %%eax \n"
  400. "mov %%eax, (%%edi, %%edx, 4) \n"
  401. "inc %%edx \n"
  402. "dec %%ecx \n"
  403. "jnz 2b \n"
  404. "adc %%ecx, %%ecx \n"
  405. "3: \n"
  406. : "=a" (dummy1), "=b" (dummy2), "=c" (c), "=d" (dummy3)
  407. : "1" (ss2), "2" (ss2_size), "3" (rest), "S" (ss1), "D" (result)
  408. : "cc", "memory" );
  409. #endif
  410. TTMATH_VECTOR_LOGC("UInt::AddVector", c, result, ss1_size)
  411. return c;
  412. }
  413. /*!
  414. subtracting ss2 from the 'this' and subtracting
  415. carry if it has been defined
  416. (this = this - ss2 - c)
  417. c must be zero or one (might be a bigger value than 1)
  418. function returns carry (1) (if it has been)
  419. */
  420. template<uint value_size>
  421. uint UInt<value_size>::Sub(const UInt<value_size> & ss2, uint c)
  422. {
  423. uint b = value_size;
  424. uint * p1 = table;
  425. uint * p2 = const_cast<uint*>(ss2.table);
  426. // we don't have to use TTMATH_REFERENCE_ASSERT here
  427. // this algorithm doesn't require it
  428. #ifndef __GNUC__
  429. __asm
  430. {
  431. push eax
  432. push ebx
  433. push ecx
  434. push edx
  435. push esi
  436. mov ecx,[b]
  437. mov ebx,[p1]
  438. mov esi,[p2]
  439. xor edx,edx // edx=0
  440. mov eax,[c]
  441. neg eax // CF=1 if rax!=0 , CF=0 if rax==0
  442. ttmath_loop:
  443. mov eax,[esi+edx*4]
  444. sbb [ebx+edx*4],eax
  445. inc edx
  446. dec ecx
  447. jnz ttmath_loop
  448. adc ecx, ecx
  449. mov [c], ecx
  450. pop esi
  451. pop edx
  452. pop ecx
  453. pop ebx
  454. pop eax
  455. }
  456. #endif
  457. #ifdef __GNUC__
  458. uint dummy, dummy2;
  459. __asm__ __volatile__(
  460. "xorl %%edx, %%edx \n"
  461. "negl %%eax \n" // CF=1 if rax!=0 , CF=0 if rax==0
  462. "1: \n"
  463. "movl (%%esi,%%edx,4), %%eax \n"
  464. "sbbl %%eax, (%%ebx,%%edx,4) \n"
  465. "incl %%edx \n"
  466. "decl %%ecx \n"
  467. "jnz 1b \n"
  468. "adc %%ecx, %%ecx \n"
  469. : "=c" (c), "=a" (dummy), "=d" (dummy2)
  470. : "0" (b), "1" (c), "b" (p1), "S" (p2)
  471. : "cc", "memory" );
  472. #endif
  473. TTMATH_LOGC("UInt::Sub", c)
  474. return c;
  475. }
  476. /*!
  477. this method subtracts one word (at a specific position)
  478. and returns a carry (if it was)
  479. e.g.
  480. if we've got (value_size=3):
  481. table[0] = 10;
  482. table[1] = 30;
  483. table[2] = 5;
  484. and we call:
  485. SubInt(2,1)
  486. then it'll be:
  487. table[0] = 10;
  488. table[1] = 30 - 2;
  489. table[2] = 5;
  490. of course if there was a carry from table[2] it would be returned
  491. */
  492. template<uint value_size>
  493. uint UInt<value_size>::SubInt(uint value, uint index)
  494. {
  495. uint b = value_size;
  496. uint * p1 = table;
  497. uint c;
  498. TTMATH_ASSERT( index < value_size )
  499. #ifndef __GNUC__
  500. __asm
  501. {
  502. push eax
  503. push ebx
  504. push ecx
  505. push edx
  506. mov ecx, [b]
  507. sub ecx, [index]
  508. mov edx, [index]
  509. mov ebx, [p1]
  510. mov eax, [value]
  511. ttmath_loop:
  512. sub [ebx+edx*4], eax
  513. jnc ttmath_end
  514. mov eax, 1
  515. inc edx
  516. dec ecx
  517. jnz ttmath_loop
  518. ttmath_end:
  519. setc al
  520. movzx edx, al
  521. mov [c], edx
  522. pop edx
  523. pop ecx
  524. pop ebx
  525. pop eax
  526. }
  527. #endif
  528. #ifdef __GNUC__
  529. uint dummy, dummy2;
  530. __asm__ __volatile__(
  531. "subl %%edx, %%ecx \n"
  532. "1: \n"
  533. "subl %%eax, (%%ebx,%%edx,4) \n"
  534. "jnc 2f \n"
  535. "movl $1, %%eax \n"
  536. "incl %%edx \n"
  537. "decl %%ecx \n"
  538. "jnz 1b \n"
  539. "2: \n"
  540. "setc %%al \n"
  541. "movzx %%al, %%edx \n"
  542. : "=d" (c), "=a" (dummy), "=c" (dummy2)
  543. : "0" (index), "1" (value), "2" (b), "b" (p1)
  544. : "cc", "memory" );
  545. #endif
  546. TTMATH_LOGC("UInt::SubInt", c)
  547. return c;
  548. }
  549. /*!
  550. this static method subtractes one vector from the other
  551. 'ss1' is larger in size or equal to 'ss2'
  552. ss1 points to the first (larger) vector
  553. ss2 points to the second vector
  554. ss1_size - size of the ss1 (and size of the result too)
  555. ss2_size - size of the ss2
  556. result - is the result vector (which has size the same as ss1: ss1_size)
  557. Example: ss1_size is 5, ss2_size is 3
  558. ss1: ss2: result (output):
  559. 5 1 5-1
  560. 4 3 4-3
  561. 2 7 2-7
  562. 6 6-1 (the borrow from previous item)
  563. 9 9
  564. return (carry): 0
  565. of course the carry (borrow) is propagated and will be returned from the last item
  566. (this method is used by the Karatsuba multiplication algorithm)
  567. */
  568. template<uint value_size>
  569. uint UInt<value_size>::SubVector(const uint * ss1, const uint * ss2, uint ss1_size, uint ss2_size, uint * result)
  570. {
  571. TTMATH_ASSERT( ss1_size >= ss2_size )
  572. uint rest = ss1_size - ss2_size;
  573. uint c;
  574. #ifndef __GNUC__
  575. // this part might be compiled with for example visual c
  576. /*
  577. the asm code is nearly the same as in AddVector
  578. only two instructions 'adc' are changed to 'sbb'
  579. */
  580. __asm
  581. {
  582. pushad
  583. mov ecx, [ss2_size]
  584. xor edx, edx // edx = 0, cf = 0
  585. mov esi, [ss1]
  586. mov ebx, [ss2]
  587. mov edi, [result]
  588. ttmath_loop:
  589. mov eax, [esi+edx*4]
  590. sbb eax, [ebx+edx*4]
  591. mov [edi+edx*4], eax
  592. inc edx
  593. dec ecx
  594. jnz ttmath_loop
  595. adc ecx, ecx // ecx has the cf state
  596. mov ebx, [rest]
  597. or ebx, ebx
  598. jz ttmath_end
  599. xor ebx, ebx // ebx = 0
  600. neg ecx // setting cf from ecx
  601. mov ecx, [rest] // ecx is != 0
  602. ttmath_loop2:
  603. mov eax, [esi+edx*4]
  604. sbb eax, ebx
  605. mov [edi+edx*4], eax
  606. inc edx
  607. dec ecx
  608. jnz ttmath_loop2
  609. adc ecx, ecx
  610. ttmath_end:
  611. mov [c], ecx
  612. popad
  613. }
  614. #endif
  615. #ifdef __GNUC__
  616. // this part should be compiled with gcc
  617. uint dummy1, dummy2, dummy3;
  618. __asm__ __volatile__(
  619. "push %%edx \n"
  620. "xor %%edx, %%edx \n" // edx = 0, cf = 0
  621. "1: \n"
  622. "mov (%%esi,%%edx,4), %%eax \n"
  623. "sbb (%%ebx,%%edx,4), %%eax \n"
  624. "mov %%eax, (%%edi,%%edx,4) \n"
  625. "inc %%edx \n"
  626. "dec %%ecx \n"
  627. "jnz 1b \n"
  628. "adc %%ecx, %%ecx \n" // ecx has the cf state
  629. "pop %%eax \n" // eax = rest
  630. "or %%eax, %%eax \n"
  631. "jz 3f \n"
  632. "xor %%ebx, %%ebx \n" // ebx = 0
  633. "neg %%ecx \n" // setting cf from ecx
  634. "mov %%eax, %%ecx \n" // ecx=rest and is != 0
  635. "2: \n"
  636. "mov (%%esi, %%edx, 4), %%eax \n"
  637. "sbb %%ebx, %%eax \n"
  638. "mov %%eax, (%%edi, %%edx, 4) \n"
  639. "inc %%edx \n"
  640. "dec %%ecx \n"
  641. "jnz 2b \n"
  642. "adc %%ecx, %%ecx \n"
  643. "3: \n"
  644. : "=a" (dummy1), "=b" (dummy2), "=c" (c), "=d" (dummy3)
  645. : "1" (ss2), "2" (ss2_size), "3" (rest), "S" (ss1), "D" (result)
  646. : "cc", "memory" );
  647. #endif
  648. TTMATH_VECTOR_LOGC("UInt::SubVector", c, result, ss1_size)
  649. return c;
  650. }
  651. /*!
  652. this method moves all bits into the left hand side
  653. return value <- this <- c
  654. the lowest *bit* will be held the 'c' and
  655. the state of one additional bit (on the left hand side)
  656. will be returned
  657. for example:
  658. let this is 001010000
  659. after Rcl2_one(1) there'll be 010100001 and Rcl2_one returns 0
  660. */
  661. template<uint value_size>
  662. uint UInt<value_size>::Rcl2_one(uint c)
  663. {
  664. uint b = value_size;
  665. uint * p1 = table;
  666. #ifndef __GNUC__
  667. __asm
  668. {
  669. push ebx
  670. push ecx
  671. push edx
  672. mov ebx, [p1]
  673. xor edx, edx
  674. mov ecx, [c]
  675. neg ecx
  676. mov ecx, [b]
  677. ttmath_loop:
  678. rcl dword ptr [ebx+edx*4], 1
  679. inc edx
  680. dec ecx
  681. jnz ttmath_loop
  682. adc ecx, ecx
  683. mov [c], ecx
  684. pop edx
  685. pop ecx
  686. pop ebx
  687. }
  688. #endif
  689. #ifdef __GNUC__
  690. uint dummy, dummy2;
  691. __asm__ __volatile__(
  692. "xorl %%edx, %%edx \n" // edx=0
  693. "negl %%eax \n" // CF=1 if eax!=0 , CF=0 if eax==0
  694. "1: \n"
  695. "rcll $1, (%%ebx, %%edx, 4) \n"
  696. "incl %%edx \n"
  697. "decl %%ecx \n"
  698. "jnz 1b \n"
  699. "adcl %%ecx, %%ecx \n"
  700. : "=c" (c), "=a" (dummy), "=d" (dummy2)
  701. : "0" (b), "1" (c), "b" (p1)
  702. : "cc", "memory" );
  703. #endif
  704. TTMATH_LOGC("UInt::Rcl2_one", c)
  705. return c;
  706. }
  707. /*!
  708. this method moves all bits into the right hand side
  709. c -> this -> return value
  710. the highest *bit* will be held the 'c' and
  711. the state of one additional bit (on the right hand side)
  712. will be returned
  713. for example:
  714. let this is 000000010
  715. after Rcr2_one(1) there'll be 100000001 and Rcr2_one returns 0
  716. */
  717. template<uint value_size>
  718. uint UInt<value_size>::Rcr2_one(uint c)
  719. {
  720. uint b = value_size;
  721. uint * p1 = table;
  722. #ifndef __GNUC__
  723. __asm
  724. {
  725. push ebx
  726. push ecx
  727. mov ebx, [p1]
  728. mov ecx, [c]
  729. neg ecx
  730. mov ecx, [b]
  731. ttmath_loop:
  732. rcr dword ptr [ebx+ecx*4-4], 1
  733. dec ecx
  734. jnz ttmath_loop
  735. adc ecx, ecx
  736. mov [c], ecx
  737. pop ecx
  738. pop ebx
  739. }
  740. #endif
  741. #ifdef __GNUC__
  742. uint dummy;
  743. __asm__ __volatile__(
  744. "negl %%eax \n" // CF=1 if eax!=0 , CF=0 if eax==0
  745. "1: \n"
  746. "rcrl $1, -4(%%ebx, %%ecx, 4) \n"
  747. "decl %%ecx \n"
  748. "jnz 1b \n"
  749. "adcl %%ecx, %%ecx \n"
  750. : "=c" (c), "=a" (dummy)
  751. : "0" (b), "1" (c), "b" (p1)
  752. : "cc", "memory" );
  753. #endif
  754. TTMATH_LOGC("UInt::Rcr2_one", c)
  755. return c;
  756. }
  757. #ifdef _MSC_VER
  758. #pragma warning (disable : 4731)
  759. //warning C4731: frame pointer register 'ebp' modified by inline assembly code
  760. #endif
  761. /*!
  762. this method moves all bits into the left hand side
  763. return value <- this <- c
  764. the lowest *bits* will be held the 'c' and
  765. the state of one additional bit (on the left hand side)
  766. will be returned
  767. for example:
  768. let this is 001010000
  769. after Rcl2(3, 1) there'll be 010000111 and Rcl2 returns 1
  770. */
  771. template<uint value_size>
  772. uint UInt<value_size>::Rcl2(uint bits, uint c)
  773. {
  774. TTMATH_ASSERT( bits>0 && bits<TTMATH_BITS_PER_UINT )
  775. uint b = value_size;
  776. uint * p1 = table;
  777. #ifndef __GNUC__
  778. __asm
  779. {
  780. push eax
  781. push ebx
  782. push ecx
  783. push edx
  784. push esi
  785. push edi
  786. push ebp
  787. mov edi, [b]
  788. mov ecx, 32
  789. sub ecx, [bits]
  790. mov edx, -1
  791. shr edx, cl
  792. mov ecx, [bits]
  793. mov ebx, [p1]
  794. mov eax, [c]
  795. mov ebp, edx // ebp = mask (modified ebp - don't read/write to variables)
  796. xor edx, edx // edx = 0
  797. mov esi, edx
  798. or eax, eax
  799. cmovnz esi, ebp // if(c) esi=mask else esi=0
  800. ttmath_loop:
  801. rol dword ptr [ebx+edx*4], cl
  802. mov eax, [ebx+edx*4]
  803. and eax, ebp
  804. xor [ebx+edx*4], eax // clearing bits
  805. or [ebx+edx*4], esi // saving old value
  806. mov esi, eax
  807. inc edx
  808. dec edi
  809. jnz ttmath_loop
  810. pop ebp // restoring ebp
  811. and eax, 1
  812. mov [c], eax
  813. pop edi
  814. pop esi
  815. pop edx
  816. pop ecx
  817. pop ebx
  818. pop eax
  819. }
  820. #endif
  821. #ifdef __GNUC__
  822. uint dummy, dummy2, dummy3;
  823. __asm__ __volatile__(
  824. "push %%ebp \n"
  825. "movl %%ecx, %%esi \n"
  826. "movl $32, %%ecx \n"
  827. "subl %%esi, %%ecx \n" // ecx = 32 - bits
  828. "movl $-1, %%edx \n" // edx = -1 (all bits set to one)
  829. "shrl %%cl, %%edx \n" // shifting (0 -> edx -> cf) (cl times)
  830. "movl %%edx, %%ebp \n" // ebp = edx = mask
  831. "movl %%esi, %%ecx \n"
  832. "xorl %%edx, %%edx \n"
  833. "movl %%edx, %%esi \n"
  834. "orl %%eax, %%eax \n"
  835. "cmovnz %%ebp, %%esi \n" // if(c) esi=mask else esi=0
  836. "1: \n"
  837. "roll %%cl, (%%ebx,%%edx,4) \n"
  838. "movl (%%ebx,%%edx,4), %%eax \n"
  839. "andl %%ebp, %%eax \n"
  840. "xorl %%eax, (%%ebx,%%edx,4) \n"
  841. "orl %%esi, (%%ebx,%%edx,4) \n"
  842. "movl %%eax, %%esi \n"
  843. "incl %%edx \n"
  844. "decl %%edi \n"
  845. "jnz 1b \n"
  846. "and $1, %%eax \n"
  847. "pop %%ebp \n"
  848. : "=a" (c), "=D" (dummy), "=S" (dummy2), "=d" (dummy3)
  849. : "0" (c), "1" (b), "b" (p1), "c" (bits)
  850. : "cc", "memory" );
  851. #endif
  852. TTMATH_LOGC("UInt::Rcl2", c)
  853. return c;
  854. }
  855. /*!
  856. this method moves all bits into the right hand side
  857. C -> this -> return value
  858. the highest *bits* will be held the 'c' and
  859. the state of one additional bit (on the right hand side)
  860. will be returned
  861. for example:
  862. let this is 000000010
  863. after Rcr2(2, 1) there'll be 110000000 and Rcr2 returns 1
  864. */
  865. template<uint value_size>
  866. uint UInt<value_size>::Rcr2(uint bits, uint c)
  867. {
  868. TTMATH_ASSERT( bits>0 && bits<TTMATH_BITS_PER_UINT )
  869. uint b = value_size;
  870. uint * p1 = table;
  871. #ifndef __GNUC__
  872. __asm
  873. {
  874. push eax
  875. push ebx
  876. push ecx
  877. push edx
  878. push esi
  879. push edi
  880. push ebp
  881. mov edi, [b]
  882. mov ecx, 32
  883. sub ecx, [bits]
  884. mov edx, -1
  885. shl edx, cl
  886. mov ecx, [bits]
  887. mov ebx, [p1]
  888. mov eax, [c]
  889. mov ebp, edx // ebp = mask (modified ebp - don't read/write to variables)
  890. xor edx, edx // edx = 0
  891. mov esi, edx
  892. add edx, edi
  893. dec edx // edx is pointing at the end of the table (on last word)
  894. or eax, eax
  895. cmovnz esi, ebp // if(c) esi=mask else esi=0
  896. ttmath_loop:
  897. ror dword ptr [ebx+edx*4], cl
  898. mov eax, [ebx+edx*4]
  899. and eax, ebp
  900. xor [ebx+edx*4], eax // clearing bits
  901. or [ebx+edx*4], esi // saving old value
  902. mov esi, eax
  903. dec edx
  904. dec edi
  905. jnz ttmath_loop
  906. pop ebp // restoring ebp
  907. rol eax, 1 // 31bit will be first
  908. and eax, 1
  909. mov [c], eax
  910. pop edi
  911. pop esi
  912. pop edx
  913. pop ecx
  914. pop ebx
  915. pop eax
  916. }
  917. #endif
  918. #ifdef __GNUC__
  919. uint dummy, dummy2, dummy3;
  920. __asm__ __volatile__(
  921. "push %%ebp \n"
  922. "movl %%ecx, %%esi \n"
  923. "movl $32, %%ecx \n"
  924. "subl %%esi, %%ecx \n" // ecx = 32 - bits
  925. "movl $-1, %%edx \n" // edx = -1 (all bits set to one)
  926. "shll %%cl, %%edx \n" // shifting (cf <- edx <- 0) (cl times)
  927. "movl %%edx, %%ebp \n" // ebp = edx = mask
  928. "movl %%esi, %%ecx \n"
  929. "xorl %%edx, %%edx \n"
  930. "movl %%edx, %%esi \n"
  931. "addl %%edi, %%edx \n"
  932. "decl %%edx \n" // edx is pointing at the end of the table (on last word)
  933. "orl %%eax, %%eax \n"
  934. "cmovnz %%ebp, %%esi \n" // if(c) esi=mask else esi=0
  935. "1: \n"
  936. "rorl %%cl, (%%ebx,%%edx,4) \n"
  937. "movl (%%ebx,%%edx,4), %%eax \n"
  938. "andl %%ebp, %%eax \n"
  939. "xorl %%eax, (%%ebx,%%edx,4) \n"
  940. "orl %%esi, (%%ebx,%%edx,4) \n"
  941. "movl %%eax, %%esi \n"
  942. "decl %%edx \n"
  943. "decl %%edi \n"
  944. "jnz 1b \n"
  945. "roll $1, %%eax \n"
  946. "andl $1, %%eax \n"
  947. "pop %%ebp \n"
  948. : "=a" (c), "=D" (dummy), "=S" (dummy2), "=d" (dummy3)
  949. : "0" (c), "1" (b), "b" (p1), "c" (bits)
  950. : "cc", "memory" );
  951. #endif
  952. TTMATH_LOGC("UInt::Rcr2", c)
  953. return c;
  954. }
  955. #ifdef _MSC_VER
  956. #pragma warning (default : 4731)
  957. #endif
  958. /*
  959. this method returns the number of the highest set bit in one 32-bit word
  960. if the 'x' is zero this method returns '-1'
  961. */
  962. template<uint value_size>
  963. sint UInt<value_size>::FindLeadingBitInWord(uint x)
  964. {
  965. sint result;
  966. #ifndef __GNUC__
  967. __asm
  968. {
  969. push eax
  970. push edx
  971. mov edx,-1
  972. bsr eax,[x]
  973. cmovz eax,edx
  974. mov [result], eax
  975. pop edx
  976. pop eax
  977. }
  978. #endif
  979. #ifdef __GNUC__
  980. uint dummy;
  981. __asm__ (
  982. "movl $-1, %1 \n"
  983. "bsrl %2, %0 \n"
  984. "cmovz %1, %0 \n"
  985. : "=r" (result), "=&r" (dummy)
  986. : "r" (x)
  987. : "cc" );
  988. #endif
  989. return result;
  990. }
  991. /*
  992. this method returns the number of the smallest set bit in one 32-bit word
  993. if the 'x' is zero this method returns '-1'
  994. */
  995. template<uint value_size>
  996. sint UInt<value_size>::FindLowestBitInWord(uint x)
  997. {
  998. sint result;
  999. #ifndef __GNUC__
  1000. __asm
  1001. {
  1002. push eax
  1003. push edx
  1004. mov edx,-1
  1005. bsf eax,[x]
  1006. cmovz eax,edx
  1007. mov [result], eax
  1008. pop edx
  1009. pop eax
  1010. }
  1011. #endif
  1012. #ifdef __GNUC__
  1013. uint dummy;
  1014. __asm__ (
  1015. "movl $-1, %1 \n"
  1016. "bsfl %2, %0 \n"
  1017. "cmovz %1, %0 \n"
  1018. : "=r" (result), "=&r" (dummy)
  1019. : "r" (x)
  1020. : "cc" );
  1021. #endif
  1022. return result;
  1023. }
  1024. /*!
  1025. this method sets a special bit in the 'value'
  1026. and returns the last state of the bit (zero or one)
  1027. bit is from <0,31>
  1028. e.g.
  1029. uint x = 100;
  1030. uint bit = SetBitInWord(x, 3);
  1031. now: x = 108 and bit = 0
  1032. */
  1033. template<uint value_size>
  1034. uint UInt<value_size>::SetBitInWord(uint & value, uint bit)
  1035. {
  1036. TTMATH_ASSERT( bit < TTMATH_BITS_PER_UINT )
  1037. uint old_bit;
  1038. uint v = value;
  1039. #ifndef __GNUC__
  1040. __asm
  1041. {
  1042. push ebx
  1043. push eax
  1044. mov eax, [v]
  1045. mov ebx, [bit]
  1046. bts eax, ebx
  1047. mov [v], eax
  1048. setc bl
  1049. movzx ebx, bl
  1050. mov [old_bit], ebx
  1051. pop eax
  1052. pop ebx
  1053. }
  1054. #endif
  1055. #ifdef __GNUC__
  1056. __asm__ (
  1057. "btsl %%ebx, %%eax \n"
  1058. "setc %%bl \n"
  1059. "movzx %%bl, %%ebx \n"
  1060. : "=a" (v), "=b" (old_bit)
  1061. : "0" (v), "1" (bit)
  1062. : "cc" );
  1063. #endif
  1064. value = v;
  1065. return old_bit;
  1066. }
  1067. /*!
  1068. multiplication: result_high:result_low = a * b
  1069. result_high - higher word of the result
  1070. result_low - lower word of the result
  1071. this methos never returns a carry
  1072. this method is used in the second version of the multiplication algorithms
  1073. */
  1074. template<uint value_size>
  1075. void UInt<value_size>::MulTwoWords(uint a, uint b, uint * result_high, uint * result_low)
  1076. {
  1077. /*
  1078. we must use these temporary variables in order to inform the compilator
  1079. that value pointed with result1 and result2 has changed
  1080. this has no effect in visual studio but it's useful when
  1081. using gcc and options like -Ox
  1082. */
  1083. uint result1_;
  1084. uint result2_;
  1085. #ifndef __GNUC__
  1086. __asm
  1087. {
  1088. push eax
  1089. push edx
  1090. mov eax, [a]
  1091. mul dword ptr [b]
  1092. mov [result2_], edx
  1093. mov [result1_], eax
  1094. pop edx
  1095. pop eax
  1096. }
  1097. #endif
  1098. #ifdef __GNUC__
  1099. __asm__ (
  1100. "mull %%edx \n"
  1101. : "=a" (result1_), "=d" (result2_)
  1102. : "0" (a), "1" (b)
  1103. : "cc" );
  1104. #endif
  1105. *result_low = result1_;
  1106. *result_high = result2_;
  1107. }
  1108. /*!
  1109. *
  1110. * Division
  1111. *
  1112. *
  1113. */
  1114. /*!
  1115. this method calculates 64bits word a:b / 32bits c (a higher, b lower word)
  1116. r = a:b / c and rest - remainder
  1117. *
  1118. * WARNING:
  1119. * if r (one word) is too small for the result or c is equal zero
  1120. * there'll be a hardware interruption (0)
  1121. * and probably the end of your program
  1122. *
  1123. */
  1124. template<uint value_size>
  1125. void UInt<value_size>::DivTwoWords(uint a, uint b, uint c, uint * r, uint * rest)
  1126. {
  1127. uint r_;
  1128. uint rest_;
  1129. /*
  1130. these variables have similar meaning like those in
  1131. the multiplication algorithm MulTwoWords
  1132. */
  1133. TTMATH_ASSERT( c != 0 )
  1134. #ifndef __GNUC__
  1135. __asm
  1136. {
  1137. push eax
  1138. push edx
  1139. mov edx, [a]
  1140. mov eax, [b]
  1141. div dword ptr [c]
  1142. mov [r_], eax
  1143. mov [rest_], edx
  1144. pop edx
  1145. pop eax
  1146. }
  1147. #endif
  1148. #ifdef __GNUC__
  1149. __asm__ (
  1150. "divl %%ecx \n"
  1151. : "=a" (r_), "=d" (rest_)
  1152. : "0" (b), "1" (a), "c" (c)
  1153. : "cc" );
  1154. #endif
  1155. *r = r_;
  1156. *rest = rest_;
  1157. }
  1158. } //namespace
  1159. #endif //ifdef TTMATH_PLATFORM32
  1160. #endif //ifndef TTMATH_NOASM
  1161. #endif