/*
	Copyright 2021 Jeroen van Rijn <[email protected]>.
	Made available under Odin's BSD-3 license.

	An arbitrary precision mathematics implementation in Odin.
	For the theoretical underpinnings, see Knuth's The Art of Computer Programming, Volume 2, section 4.3.
	The code started out as an idiomatic source port of libTomMath, which is in the public domain, with thanks.

	============================= Private procedures =============================

	Private procedures used by the above low-level routines follow.

	Don't call these yourself unless you really know what you're doing.
	They include implementations that are optimal for certain ranges of input only.
	These aren't exported for the same reasons.
*/
package math_big

import "core:intrinsics"
import "core:mem"
/*
	Multiplies |a| * |b| and only computes up to `digits` digits of the result.
	HAC pp. 595, Algorithm 14.12. Modified so you can control how
	many digits of output are created.
*/
_private_int_mul :: proc(dest, a, b: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	/*
		Can we use the fast multiplier?
	*/
	if digits < _WARRAY && min(a.used, b.used) < _MAX_COMBA {
		return #force_inline _private_int_mul_comba(dest, a, b, digits)
	}

	/*
		Set up temporary output `Int`, which we'll swap for `dest` when done.
	*/
	t := &Int{}
	internal_grow(t, max(digits, _DEFAULT_DIGIT_COUNT)) or_return
	t.used = digits

	/*
		Compute the digits of the product directly.
	*/
	pa := a.used
	for ix := 0; ix < pa; ix += 1 {
		/*
			Limit ourselves to `digits` DIGITs of output.
		*/
		pb    := min(b.used, digits - ix)
		carry := _WORD(0)
		iy    := 0

		/*
			Compute the column of the output and propagate the carry.
		*/
		#no_bounds_check for iy = 0; iy < pb; iy += 1 {
			/*
				Compute the column as a _WORD.
			*/
			column := _WORD(t.digit[ix + iy]) + _WORD(a.digit[ix]) * _WORD(b.digit[iy]) + carry

			/*
				The new column is the lower part of the result.
			*/
			t.digit[ix + iy] = DIGIT(column & _WORD(_MASK))

			/*
				Get the carry word from the result.
			*/
			carry = column >> _DIGIT_BITS
		}
		/*
			Set carry if it is placed below `digits`.
		*/
		if ix + iy < digits {
			t.digit[ix + pb] = DIGIT(carry)
		}
	}

	internal_swap(dest, t)
	internal_destroy(t)
	return internal_clamp(dest)
}
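/*
	Worked base-10 sketch of the truncated multiply above (illustrative only; the
	real code works on DIGITs in base 2**_DIGIT_BITS):

	a = 123 -> digit [3, 2, 1], b = 456 -> digit [6, 5, 4], digits = 3.
	The full product is 56088, but only columns 0..2 are computed, leaving
	t.digit = [8, 8, 0], i.e. 56088 mod 10^3. Columns at index >= `digits` are
	never touched, which is what makes this cheaper than a full multiply.
*/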
/*
	Multiplication using the Toom-Cook 3-way algorithm.

	Much more complicated than Karatsuba but has a lower asymptotic running time of O(N**1.464).
	This algorithm is only particularly useful on VERY large inputs.
	(We're talking 1000s of digits here...).

	This file contains code from J. Arndt's book "Matters Computational"
	and the accompanying FXT-library with permission of the author.

	Setup from:
		Chung, Jaewook, and M. Anwar Hasan. "Asymmetric squaring formulae."
		18th IEEE Symposium on Computer Arithmetic (ARITH'07). IEEE, 2007.

	The interpolation from above needed one temporary variable more than the interpolation here:
		Bodrato, Marco, and Alberto Zanoni. "What about Toom-Cook matrices optimality."
		Centro Vito Volterra, Universita di Roma Tor Vergata (2006).
*/
_private_int_mul_toom :: proc(dest, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	S1, S2, T1, a0, a1, a2, b0, b1, b2 := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
	defer internal_destroy(S1, S2, T1, a0, a1, a2, b0, b1, b2)

	/*
		Init temps.
	*/
	internal_init_multi(S1, S2, T1) or_return

	/*
		B
	*/
	B := min(a.used, b.used) / 3

	/*
		a = a2 * x^2 + a1 * x + a0;
	*/
	internal_grow(a0, B)              or_return
	internal_grow(a1, B)              or_return
	internal_grow(a2, a.used - 2 * B) or_return

	a0.used, a1.used = B, B
	a2.used = a.used - 2 * B

	internal_copy_digits(a0, a, a0.used)        or_return
	internal_copy_digits(a1, a, a1.used, B)     or_return
	internal_copy_digits(a2, a, a2.used, 2 * B) or_return

	internal_clamp(a0)
	internal_clamp(a1)
	internal_clamp(a2)

	/*
		b = b2 * x^2 + b1 * x + b0;
	*/
	internal_grow(b0, B)              or_return
	internal_grow(b1, B)              or_return
	internal_grow(b2, b.used - 2 * B) or_return

	b0.used, b1.used = B, B
	b2.used = b.used - 2 * B

	internal_copy_digits(b0, b, b0.used)        or_return
	internal_copy_digits(b1, b, b1.used, B)     or_return
	internal_copy_digits(b2, b, b2.used, 2 * B) or_return

	internal_clamp(b0)
	internal_clamp(b1)
	internal_clamp(b2)

	/*
		\\ S1 = (a2 + a1 + a0) * (b2 + b1 + b0);
	*/
	internal_add(T1, a2, a1)   or_return /* T1   = a2 + a1;   */
	internal_add(S2, T1, a0)   or_return /* S2   = T1 + a0;   */
	internal_add(dest, b2, b1) or_return /* dest = b2 + b1;   */
	internal_add(S1, dest, b0) or_return /* S1   = dest + b0; */
	internal_mul(S1, S1, S2)   or_return /* S1   = S1 * S2;   */

	/*
		\\ S2 = (4*a2 + 2*a1 + a0) * (4*b2 + 2*b1 + b0);
	*/
	internal_add(T1, T1, a2)      or_return /* T1   = T1 + a2;     */
	internal_int_shl1(T1, T1)     or_return /* T1   = T1 << 1;     */
	internal_add(T1, T1, a0)      or_return /* T1   = T1 + a0;     */
	internal_add(dest, dest, b2)  or_return /* dest = dest + b2;   */
	internal_int_shl1(dest, dest) or_return /* dest = dest << 1;   */
	internal_add(dest, dest, b0)  or_return /* dest = dest + b0;   */
	internal_mul(S2, T1, dest)    or_return /* S2   = T1 * dest;   */

	/*
		\\ S3 = (a2 - a1 + a0) * (b2 - b1 + b0);
	*/
	internal_sub(a1, a2, a1) or_return /* a1 = a2 - a1; */
	internal_add(a1, a1, a0) or_return /* a1 = a1 + a0; */
	internal_sub(b1, b2, b1) or_return /* b1 = b2 - b1; */
	internal_add(b1, b1, b0) or_return /* b1 = b1 + b0; */
	internal_mul(a1, a1, b1) or_return /* a1 = a1 * b1; */
	internal_mul(b1, a2, b2) or_return /* b1 = a2 * b2; */

	/*
		\\ S2 = (S2 - S3) / 3;
	*/
	internal_sub(S2, S2, a1)   or_return /* S2 = S2 - a1;                             */
	_private_int_div_3(S2, S2) or_return /* S2 = S2 / 3; \\ this is an exact division */
	internal_sub(a1, S1, a1)   or_return /* a1 = S1 - a1; */
	internal_int_shr1(a1, a1)  or_return /* a1 = a1 >> 1; */
	internal_mul(a0, a0, b0)   or_return /* a0 = a0 * b0; */
	internal_sub(S1, S1, a0)   or_return /* S1 = S1 - a0; */
	internal_sub(S2, S2, S1)   or_return /* S2 = S2 - S1; */
	internal_int_shr1(S2, S2)  or_return /* S2 = S2 >> 1; */
	internal_sub(S1, S1, a1)   or_return /* S1 = S1 - a1; */
	internal_sub(S1, S1, b1)   or_return /* S1 = S1 - b1; */
	internal_int_shl1(T1, b1)  or_return /* T1 = b1 << 1; */
	internal_sub(S2, S2, T1)   or_return /* S2 = S2 - T1; */
	internal_sub(a1, a1, S2)   or_return /* a1 = a1 - S2; */

	/*
		P = b1*x^4 + S2*x^3 + S1*x^2 + a1*x + a0;
	*/
	_private_int_shl_leg(b1, 4 * B) or_return
	_private_int_shl_leg(S2, 3 * B) or_return
	internal_add(b1, b1, S2)        or_return
	_private_int_shl_leg(S1, 2 * B) or_return
	internal_add(b1, b1, S1)        or_return
	_private_int_shl_leg(a1, 1 * B) or_return
	internal_add(b1, b1, a1)        or_return
	internal_add(dest, b1, a0)      or_return

	/*
		a * b - P
	*/
	return nil
}
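/*
	Reading guide for the above (derived from the code, not from the papers):
	with a = a2*x^2 + a1*x + a0 and b likewise, the five products computed are
	the evaluations of a(x)*b(x) at five points:

		a0 * b0                                       \\ x = 0
		S1 = (a2 + a1 + a0) * (b2 + b1 + b0)          \\ x = 1
		S2 = (4*a2 + 2*a1 + a0) * (4*b2 + 2*b1 + b0)  \\ x = 2
		a1 = (a2 - a1 + a0) * (b2 - b1 + b0)          \\ x = -1
		b1 = a2 * b2                                  \\ x = infinity

	Five evaluations determine the degree-4 product polynomial; the subtraction,
	halving and exact-division-by-3 steps that follow recover its coefficients,
	which are then recombined with shifts by multiples of B legs.
*/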
/*
	product = |a| * |b| using Karatsuba multiplication with three half-size multiplications.

	Let `B` represent the radix [e.g. 2**_DIGIT_BITS] and let `n` represent
	half of the number of digits in min(a, b).

	`a` = `a1` * `B`**`n` + `a0`
	`b` = `b1` * `B`**`n` + `b0`

	Then, a * b => a1b1 * B**2n + ((a1 + a0)(b1 + b0) - (a0b0 + a1b1)) * B**n + a0b0

	Note that a1b1 and a0b0 are used twice and only need to be computed once.
	So in total three half-size (half # of digits) multiplications are performed:
	a0b0, a1b1 and (a1 + a0)(b1 + b0).

	Note that a multiplication of half the digits requires 1/4th the number of
	single precision multiplications, so in total after one call 25% of the
	single precision multiplications are saved.

	Note also that the call to `internal_mul` can end up back in this function
	if a0, a1, b0, or b1 are above the threshold.

	This is known as divide-and-conquer and leads to the famous O(N**lg(3)) or O(N**1.584)
	work which is asymptotically lower than the standard O(N**2) that the
	baseline/comba methods use. Generally though, the overhead of this method doesn't pay off
	until a certain size is reached, of around 80 used DIGITs.
*/
_private_int_mul_karatsuba :: proc(dest, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	x0, x1, y0, y1, t1, x0y0, x1y1 := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
	defer internal_destroy(x0, x1, y0, y1, t1, x0y0, x1y1)

	/*
		min # of digits, divided by two.
	*/
	B := min(a.used, b.used) >> 1

	/*
		Init all the temps.
	*/
	internal_grow(x0, B)          or_return
	internal_grow(x1, a.used - B) or_return
	internal_grow(y0, B)          or_return
	internal_grow(y1, b.used - B) or_return
	internal_grow(t1, B * 2)      or_return
	internal_grow(x0y0, B * 2)    or_return
	internal_grow(x1y1, B * 2)    or_return

	/*
		Now shift the digits.
	*/
	x0.used, y0.used = B, B
	x1.used = a.used - B
	y1.used = b.used - B

	/*
		We copy the digits directly instead of using higher level functions
		since we also need to shift the digits.
	*/
	internal_copy_digits(x0, a, x0.used)
	internal_copy_digits(y0, b, y0.used)
	internal_copy_digits(x1, a, x1.used, B)
	internal_copy_digits(y1, b, y1.used, B)

	/*
		Only need to clamp the lower words since by definition the
		upper words x1/y1 must have a known number of digits.
	*/
	clamp(x0)
	clamp(y0)

	/*
		Now calc the products x0y0 and x1y1,
		after this x0 is no longer required, free temp [x0 == t2]!
	*/
	internal_mul(x0y0, x0, y0) or_return /* x0y0 = x0*y0               */
	internal_mul(x1y1, x1, y1) or_return /* x1y1 = x1*y1               */
	internal_add(t1, x1, x0)   or_return /* now calc x1+x0 and         */
	internal_add(x0, y1, y0)   or_return /* t2 = y1 + y0               */
	internal_mul(t1, t1, x0)   or_return /* t1 = (x1 + x0) * (y1 + y0) */

	/*
		Add x0y0.
	*/
	internal_add(x0, x0y0, x1y1) or_return /* t2 = x0y0 + x1y1                     */
	internal_sub(t1, t1, x0)     or_return /* t1 = (x1+x0)*(y1+y0) - (x1y1 + x0y0) */

	/*
		Shift by B.
	*/
	_private_int_shl_leg(t1, B)       or_return /* t1   = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
	_private_int_shl_leg(x1y1, B * 2) or_return /* x1y1 = x1y1 << 2*B                        */
	internal_add(t1, x0y0, t1)        or_return /* t1 = x0y0 + t1        */
	internal_add(dest, t1, x1y1)      or_return /* t1 = x0y0 + t1 + x1y1 */

	return nil
}
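/*
	Worked base-10 sketch of the identity (illustrative): a = 1234, b = 5678
	with B = 10, n = 2, so a1 = 12, a0 = 34, b1 = 56, b0 = 78.

		a1b1 = 672, a0b0 = 2652
		(a1 + a0)(b1 + b0) = 46 * 134 = 6164
		middle = 6164 - 672 - 2652 = 2840

	a * b = 672 * 10^4 + 2840 * 10^2 + 2652 = 7006652, which matches 1234 * 5678.
	Only three half-size multiplications were needed instead of four.
*/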
/*
	Fast (comba) multiplier

	This is the fast column-array [comba] multiplier. It is
	designed to compute the columns of the product first,
	then handle the carries afterwards. This has the effect
	of making the nested loops that compute the columns very
	simple and schedulable on super-scalar processors.

	This has been modified to produce a variable number of
	digits of output so if, say, only a half-product is required,
	you don't have to compute the upper half (a feature
	required for fast Barrett reduction).

	Based on Algorithm 14.12 on pp. 595 of HAC.
*/
_private_int_mul_comba :: proc(dest, a, b: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	/*
		Set up array.
	*/
	W: [_WARRAY]DIGIT = ---

	/*
		Grow the destination as required.
	*/
	internal_grow(dest, digits) or_return

	/*
		Number of output digits to produce.
	*/
	pa := min(digits, a.used + b.used)

	/*
		Clear the carry.
	*/
	_W := _WORD(0)

	ix: int
	for ix = 0; ix < pa; ix += 1 {
		tx, ty, iy, iz: int

		/*
			Get offsets into the two bignums.
		*/
		ty = min(b.used - 1, ix)
		tx = ix - ty

		/*
			This is the number of times the loop will iterate, essentially.
			while (tx++ < a->used && ty-- >= 0) { ... }
		*/
		iy = min(a.used - tx, ty + 1)

		/*
			Execute loop.
		*/
		#no_bounds_check for iz = 0; iz < iy; iz += 1 {
			_W += _WORD(a.digit[tx + iz]) * _WORD(b.digit[ty - iz])
		}

		/*
			Store term.
		*/
		W[ix] = DIGIT(_W) & _MASK

		/*
			Make next carry.
		*/
		_W = _W >> _WORD(_DIGIT_BITS)
	}

	/*
		Setup dest.
	*/
	old_used := dest.used
	dest.used = pa

	/*
		Now extract the previous digit [below the carry].
	*/
	copy_slice(dest.digit[0:], W[:pa])

	/*
		Clear unused digits [that existed in the old copy of dest].
	*/
	internal_zero_unused(dest, old_used)

	/*
		Adjust dest.used based on leading zeroes.
	*/
	return internal_clamp(dest)
}
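/*
	Column walk-through in base 10 (illustrative): a = 123 -> [3, 2, 1],
	b = 456 -> [6, 5, 4]. Each column ix sums a.digit[tx + iz] * b.digit[ty - iz]
	plus the carry from the previous column:

		ix = 0: 3*6                 = 18 -> W[0] = 8, carry 1
		ix = 1: 3*5 + 2*6 + 1       = 28 -> W[1] = 8, carry 2
		ix = 2: 3*4 + 2*5 + 1*6 + 2 = 30 -> W[2] = 0, carry 3
		ix = 3: 2*4 + 1*5 + 3       = 16 -> W[3] = 6, carry 1
		ix = 4: 1*4 + 1             =  5 -> W[4] = 5

	Result digits [8, 8, 0, 6, 5] = 56088. Carries are resolved once per column
	instead of once per inner product, which is the whole point of comba.
*/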
/*
	Multiplies |a| * |b| and does not compute the lower `digits` digits
	[meant to get the higher part of the product].
*/
_private_int_mul_high :: proc(dest, a, b: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	/*
		Can we use the fast multiplier?
	*/
	if a.used + b.used + 1 < _WARRAY && min(a.used, b.used) < _MAX_COMBA {
		return _private_int_mul_high_comba(dest, a, b, digits)
	}

	internal_grow(dest, a.used + b.used + 1) or_return
	dest.used = a.used + b.used + 1

	pa := a.used
	pb := b.used
	for ix := 0; ix < pa; ix += 1 {
		carry := DIGIT(0)
		for iy := digits - ix; iy < pb; iy += 1 {
			/*
				Calculate the double precision result.
			*/
			r := _WORD(dest.digit[ix + iy]) + _WORD(a.digit[ix]) * _WORD(b.digit[iy]) + _WORD(carry)

			/*
				Get the lower part.
			*/
			dest.digit[ix + iy] = DIGIT(r & _WORD(_MASK))

			/*
				Carry the carry.
			*/
			carry = DIGIT(r >> _WORD(_DIGIT_BITS))
		}
		dest.digit[ix + pb] = carry
	}
	return internal_clamp(dest)
}
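/*
	This is the complement of `_private_int_mul`: with the same base-10 sketch
	(a = 123, b = 456, digits = 3), the truncated multiply keeps columns 0..2
	(56088 mod 10^3), while this routine starts each row at iy = digits - ix and
	so only fills columns >= 3, i.e. the digits of floor(56088 / 10^3) = 56.
	Barrett reduction uses this half-product to avoid forming the full product.
*/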
/*
	This is a modified version of `_private_int_mul_comba` that only produces output digits *above* `digits`.
	See the comments for `_private_int_mul_comba` to see how it works.

	This is used in the Barrett reduction since for one of the multiplications
	only the higher digits were needed. This essentially halves the work.

	Based on Algorithm 14.12 on pp. 595 of HAC.
*/
_private_int_mul_high_comba :: proc(dest, a, b: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	W: [_WARRAY]DIGIT = ---
	_W: _WORD = 0

	/*
		Number of output digits to produce. Grow the destination as required.
	*/
	pa := a.used + b.used
	internal_grow(dest, pa) or_return

	ix: int
	for ix = digits; ix < pa; ix += 1 {
		/*
			Get offsets into the two bignums.
		*/
		ty := min(b.used - 1, ix)
		tx := ix - ty

		/*
			This is the number of times the loop will iterate, essentially it's
			while (tx++ < a->used && ty-- >= 0) { ... }
		*/
		iy := min(a.used - tx, ty + 1)

		/*
			Execute loop.
		*/
		for iz := 0; iz < iy; iz += 1 {
			_W += _WORD(a.digit[tx + iz]) * _WORD(b.digit[ty - iz])
		}

		/*
			Store term.
		*/
		W[ix] = DIGIT(_W) & DIGIT(_MASK)

		/*
			Make next carry.
		*/
		_W = _W >> _WORD(_DIGIT_BITS)
	}

	/*
		Setup dest.
	*/
	old_used := dest.used
	dest.used = pa

	for ix = digits; ix < pa; ix += 1 {
		/*
			Now extract the previous digit [below the carry].
		*/
		dest.digit[ix] = W[ix]
	}

	/*
		Zero remainder.
	*/
	internal_zero_unused(dest, old_used)

	/*
		Adjust dest.used based on leading zeroes.
	*/
	return internal_clamp(dest)
}
/*
	Balance multiplication for lopsided inputs: slices the larger operand into
	chunks the size of the smaller one, multiplies each chunk by the smaller
	operand, and sums the shifted partial products.
*/
_private_int_mul_balance :: proc(dest, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	a, b := a, b

	a0, tmp, r := &Int{}, &Int{}, &Int{}
	defer internal_destroy(a0, tmp, r)

	b_size   := min(a.used, b.used)
	n_blocks := max(a.used, b.used) / b_size

	internal_grow(a0, b_size + 2) or_return
	internal_init_multi(tmp, r)   or_return

	/*
		Make sure that `a` is the larger one.
	*/
	if a.used < b.used {
		a, b = b, a
	}
	assert(a.used >= b.used)

	i, j := 0, 0
	for ; i < n_blocks; i += 1 {
		/*
			Cut a slice off of `a`.
		*/
		a0.used = b_size
		internal_copy_digits(a0, a, a0.used, j)
		j += a0.used
		internal_clamp(a0)

		/*
			Multiply with `b`.
		*/
		internal_mul(tmp, a0, b) or_return

		/*
			Shift `tmp` to the correct position.
		*/
		_private_int_shl_leg(tmp, b_size * i) or_return

		/*
			Add to output. No carry needed.
		*/
		internal_add(r, r, tmp) or_return
	}

	/*
		The left-overs; there are always left-overs.
	*/
	if j < a.used {
		a0.used = a.used - j
		internal_copy_digits(a0, a, a0.used, j)
		j += a0.used
		internal_clamp(a0)

		internal_mul(tmp, a0, b)              or_return
		_private_int_shl_leg(tmp, b_size * i) or_return
		internal_add(r, r, tmp)               or_return
	}

	internal_swap(dest, r)
	return
}
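/*
	Sketch of the slicing (illustrative): say a.used = 7 and b.used = 2.
	Then b_size = 2 and n_blocks = 3, so the loop multiplies `b` by the digit
	slices a[0..1], a[2..3] and a[4..5], shifting the partial products left by
	0, 2 and 4 legs respectively before adding them to `r`. The tail a[6] is
	picked up by the left-over block after the loop.
*/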
/*
	Low level squaring, b = a*a, HAC pp. 596-597, Algorithm 14.16.
	Assumes `dest` and `src` not to be `nil`, and `src` to have been initialized.
*/
_private_int_sqr :: proc(dest, src: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	pa := src.used

	t := &Int{}; ix, iy: int

	/*
		Grow `t` to maximum needed size, or `_DEFAULT_DIGIT_COUNT`, whichever is bigger.
	*/
	internal_grow(t, max((2 * pa) + 1, _DEFAULT_DIGIT_COUNT)) or_return
	t.used = (2 * pa) + 1

	#no_bounds_check for ix = 0; ix < pa; ix += 1 {
		carry := DIGIT(0)

		/*
			First calculate the digit at 2*ix; calculate double precision result.
		*/
		r := _WORD(t.digit[ix+ix]) + (_WORD(src.digit[ix]) * _WORD(src.digit[ix]))

		/*
			Store lower part in result.
		*/
		t.digit[ix+ix] = DIGIT(r & _WORD(_MASK))

		/*
			Get the carry.
		*/
		carry = DIGIT(r >> _DIGIT_BITS)

		#no_bounds_check for iy = ix + 1; iy < pa; iy += 1 {
			/*
				First calculate the product.
			*/
			r = _WORD(src.digit[ix]) * _WORD(src.digit[iy])

			/*
				Now calculate the double precision result. Note we use
				addition instead of *2 since it's easier to optimize.
			*/
			r = _WORD(t.digit[ix+iy]) + r + r + _WORD(carry)

			/*
				Store lower part.
			*/
			t.digit[ix+iy] = DIGIT(r & _WORD(_MASK))

			/*
				Get carry.
			*/
			carry = DIGIT(r >> _DIGIT_BITS)
		}
		/*
			Propagate upwards.
		*/
		#no_bounds_check for carry != 0 {
			r = _WORD(t.digit[ix+iy]) + _WORD(carry)
			t.digit[ix+iy] = DIGIT(r & _WORD(_MASK))
			carry = DIGIT(r >> _WORD(_DIGIT_BITS))
			iy += 1
		}
	}

	err = internal_clamp(t)
	internal_swap(dest, t)
	internal_destroy(t)
	return err
}
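/*
	Worked base-10 sketch (illustrative): squaring 123 -> [3, 2, 1].

		ix = 0: square 3*3 = 9 -> t[0] = 9; cross terms 2*(3*2) and 2*(3*1)
		        update t[1] and t[2]
		ix = 1: square 2*2 added into t[2]; cross term 2*(2*1) into t[3]
		ix = 2: square 1*1 added into t[4]

	Result digits [9, 2, 1, 5, 1] = 15129 = 123^2. Each distinct pair (ix, iy)
	is visited once and doubled via `r + r`, instead of being computed twice as
	a general multiply would.
*/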
/*
	The gist of squaring...

	You do it like multiplication, except the offset of the tmpx [the one that starts closer to zero]
	can't equal the offset of tmpy.
	So basically you set up `iy` like before, then you min it with (ty - tx) so that it never happens.
	You double all those you add in the inner loop. After that loop you do the squares and add them in.

	Assumes `dest` and `src` not to be `nil` and `src` to have been initialized.
*/
_private_int_sqr_comba :: proc(dest, src: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	W: [_WARRAY]DIGIT = ---

	/*
		Grow the destination as required.
	*/
	pa := uint(src.used) + uint(src.used)
	internal_grow(dest, int(pa)) or_return

	/*
		Number of output digits to produce.
	*/
	W1 := _WORD(0)
	_W : _WORD = ---

	ix := uint(0)
	#no_bounds_check for ; ix < pa; ix += 1 {
		/*
			Clear counter.
		*/
		_W = {}

		/*
			Get offsets into the two bignums.
		*/
		ty := min(uint(src.used) - 1, ix)
		tx := ix - ty

		/*
			This is the number of times the loop will iterate,
			essentially while (tx++ < a->used && ty-- >= 0) { ... }
		*/
		iy := min(uint(src.used) - tx, ty + 1)

		/*
			Now for squaring, tx can never equal ty.
			We halve the distance since they approach at a rate of 2x,
			and we have to round because odd cases need to be executed.
		*/
		iy = min(iy, ((ty - tx) + 1) >> 1)

		/*
			Execute loop.
		*/
		#no_bounds_check for iz := uint(0); iz < iy; iz += 1 {
			_W += _WORD(src.digit[tx + iz]) * _WORD(src.digit[ty - iz])
		}

		/*
			Double the inner product and add carry.
		*/
		_W = _W + _W + W1

		/*
			Even columns have the square term in them.
		*/
		if ix & 1 == 0 {
			_W += _WORD(src.digit[ix >> 1]) * _WORD(src.digit[ix >> 1])
		}

		/*
			Store it.
		*/
		W[ix] = DIGIT(_W & _WORD(_MASK))

		/*
			Make next carry.
		*/
		W1 = _W >> _DIGIT_BITS
	}

	/*
		Setup dest.
	*/
	old_used := dest.used
	dest.used = src.used + src.used

	#no_bounds_check for ix = 0; ix < pa; ix += 1 {
		dest.digit[ix] = W[ix] & _MASK
	}

	/*
		Clear unused digits [that existed in the old copy of dest].
	*/
	internal_zero_unused(dest, old_used)
	return internal_clamp(dest)
}
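/*
	Column walk-through (illustrative): for src.used = 3 and column ix = 2 the
	generic setup gives ty = 2, tx = 0, iy = 3, but the squaring clamp
	iy = min(iy, ((ty - tx) + 1) >> 1) = 1 keeps only digit[0] * digit[2].
	That single product is doubled by `_W = _W + _W + W1`, and because ix is
	even the square term digit[1] * digit[1] is then added exactly once on top.
*/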
/*
	Karatsuba squaring, computes `dest` = `src` * `src` using three half-size squarings.

	See comments of `_private_int_mul_karatsuba` for details.
	It is essentially the same algorithm but merely tuned to perform recursive squarings.
*/
_private_int_sqr_karatsuba :: proc(dest, src: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	x0, x1, t1, t2, x0x0, x1x1 := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
	defer internal_destroy(x0, x1, t1, t2, x0x0, x1x1)

	/*
		Min # of digits, divided by two.
	*/
	B := src.used >> 1

	/*
		Init temps.
	*/
	internal_grow(x0, B)                    or_return
	internal_grow(x1, src.used - B)         or_return
	internal_grow(t1, src.used * 2)         or_return
	internal_grow(t2, src.used * 2)         or_return
	internal_grow(x0x0, B * 2)              or_return
	internal_grow(x1x1, (src.used - B) * 2) or_return

	/*
		Now shift the digits.
	*/
	x0.used = B
	x1.used = src.used - B

	#force_inline internal_copy_digits(x0, src, x0.used)
	#force_inline mem.copy_non_overlapping(&x1.digit[0], &src.digit[B], size_of(DIGIT) * x1.used)
	#force_inline internal_clamp(x0)

	/*
		Now calc the products x0*x0 and x1*x1.
	*/
	internal_sqr(x0x0, x0) or_return
	internal_sqr(x1x1, x1) or_return

	/*
		Now calc (x1+x0)^2
	*/
	internal_add(t1, x0, x1) or_return
	internal_sqr(t1, t1)     or_return

	/*
		Subtract x0x0 + x1x1 from it.
	*/
	internal_add(t2, x0x0, x1x1) or_return
	internal_sub(t1, t1, t2)     or_return

	/*
		Shift by B.
	*/
	_private_int_shl_leg(t1, B)       or_return
	_private_int_shl_leg(x1x1, B * 2) or_return

	internal_add(t1, t1, x0x0)   or_return
	internal_add(dest, t1, x1x1) or_return

	return #force_inline internal_clamp(dest)
}
/*
	Squaring using the Toom-Cook 3-way algorithm.

	Setup and interpolation from algorithm SQR_3 in:
		Chung, Jaewook, and M. Anwar Hasan. "Asymmetric squaring formulae."
		18th IEEE Symposium on Computer Arithmetic (ARITH'07). IEEE, 2007.
*/
_private_int_sqr_toom :: proc(dest, src: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	S0, a0, a1, a2 := &Int{}, &Int{}, &Int{}, &Int{}
	defer internal_destroy(S0, a0, a1, a2)

	/*
		Init temps.
	*/
	internal_zero(S0) or_return

	/*
		B
	*/
	B := src.used / 3

	/*
		a = a2 * x^2 + a1 * x + a0;
	*/
	internal_grow(a0, B)                  or_return
	internal_grow(a1, B)                  or_return
	internal_grow(a2, src.used - (2 * B)) or_return

	a0.used = B
	a1.used = B
	a2.used = src.used - 2 * B

	#force_inline mem.copy_non_overlapping(&a0.digit[0], &src.digit[0    ], size_of(DIGIT) * a0.used)
	#force_inline mem.copy_non_overlapping(&a1.digit[0], &src.digit[B    ], size_of(DIGIT) * a1.used)
	#force_inline mem.copy_non_overlapping(&a2.digit[0], &src.digit[2 * B], size_of(DIGIT) * a2.used)

	internal_clamp(a0)
	internal_clamp(a1)
	internal_clamp(a2)

	/** S0 = a0^2; */
	internal_sqr(S0, a0) or_return

	/** \\S1 = (a2 + a1 + a0)^2 */
	/** \\S2 = (a2 - a1 + a0)^2 */
	/** \\S1 = a0 + a2; */
	/** a0 = a0 + a2; */
	internal_add(a0, a0, a2) or_return

	/** \\S2 = S1 - a1; */
	/** b = a0 - a1; */
	internal_sub(dest, a0, a1) or_return

	/** \\S1 = S1 + a1; */
	/** a0 = a0 + a1; */
	internal_add(a0, a0, a1) or_return

	/** \\S1 = S1^2; */
	/** a0 = a0^2; */
	internal_sqr(a0, a0) or_return

	/** \\S2 = S2^2; */
	/** b = b^2; */
	internal_sqr(dest, dest) or_return

	/** \\ S3 = 2 * a1 * a2 */
	/** \\S3 = a1 * a2; */
	/** a1 = a1 * a2; */
	internal_mul(a1, a1, a2) or_return

	/** \\S3 = S3 << 1; */
	/** a1 = a1 << 1; */
	internal_shl(a1, a1, 1) or_return

	/** \\S4 = a2^2; */
	/** a2 = a2^2; */
	internal_sqr(a2, a2) or_return

	/** \\ tmp = (S1 + S2)/2 */
	/** \\tmp = S1 + S2; */
	/** b = a0 + b; */
	internal_add(dest, a0, dest) or_return

	/** \\tmp = tmp >> 1; */
	/** b = b >> 1; */
	internal_shr(dest, dest, 1) or_return

	/** \\ S1 = S1 - tmp - S3 */
	/** \\S1 = S1 - tmp; */
	/** a0 = a0 - b; */
	internal_sub(a0, a0, dest) or_return

	/** \\S1 = S1 - S3; */
	/** a0 = a0 - a1; */
	internal_sub(a0, a0, a1) or_return

	/** \\S2 = tmp - S4 - S0 */
	/** \\S2 = tmp - S4; */
	/** b = b - a2; */
	internal_sub(dest, dest, a2) or_return

	/** \\S2 = S2 - S0; */
	/** b = b - S0; */
	internal_sub(dest, dest, S0) or_return

	/** \\P = S4*x^4 + S3*x^3 + S2*x^2 + S1*x + S0; */
	/** P = a2*x^4 + a1*x^3 + b*x^2 + a0*x + S0; */
	_private_int_shl_leg(a2, 4 * B)   or_return
	_private_int_shl_leg(a1, 3 * B)   or_return
	_private_int_shl_leg(dest, 2 * B) or_return
	_private_int_shl_leg(a0, 1 * B)   or_return

	internal_add(a2, a2, a1)     or_return
	internal_add(dest, dest, a2) or_return
	internal_add(dest, dest, a0) or_return
	internal_add(dest, dest, S0) or_return

	/** a^2 - P */
	return #force_inline internal_clamp(dest)
}
/*
	Divide by three (based on routine from MPI and the GMP manual).
*/
_private_int_div_3 :: proc(quotient, numerator: ^Int, allocator := context.allocator) -> (remainder: DIGIT, err: Error) {
	context.allocator = allocator

	/*
		b = 2**_DIGIT_BITS / 3
	*/
	b := _WORD(1) << _WORD(_DIGIT_BITS) / _WORD(3)

	q := &Int{}
	internal_grow(q, numerator.used) or_return
	q.used = numerator.used
	q.sign = numerator.sign

	w, t: _WORD
	#no_bounds_check for ix := numerator.used - 1; ix >= 0; ix -= 1 {
		w = (w << _WORD(_DIGIT_BITS)) | _WORD(numerator.digit[ix])
		if w >= 3 {
			/*
				Multiply w by [1/3].
			*/
			t = (w * b) >> _WORD(_DIGIT_BITS)

			/*
				Now subtract 3 * [w/3] from w, to get the remainder.
			*/
			w -= t + t + t

			/*
				Fixup the remainder as required since the optimization is not exact.
			*/
			for w >= 3 {
				t += 1
				w -= 3
			}
		} else {
			t = 0
		}
		q.digit[ix] = DIGIT(t)
	}
	remainder = DIGIT(w)

	/*
		[optional] store the quotient.
	*/
	if quotient != nil {
		err = clamp(q)
		internal_swap(q, quotient)
	}
	internal_destroy(q)

	return remainder, err
}
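/*
	Why the reciprocal works (illustrative, using a hypothetical 4-bit leg
	instead of _DIGIT_BITS): b = floor(2^4 / 3) = 5. For w = 13:
	t = (13 * 5) >> 4 = 4 and w - 3*t = 1, so 13 = 3*4 + 1 with no fixup needed.
	Because b rounds 1/3 down, t can come out slightly short, which the
	`for w >= 3` loop then repairs.
*/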
/*
	Signed Integer Division

	c*b + d == a [i.e. a/b, c=quotient, d=remainder], HAC pp. 598, Algorithm 14.20.

	Note that the description in HAC is horribly incomplete.
	For example, it doesn't consider the case where digits are removed from 'x' in
	the inner loop.

	It also doesn't consider the case that y has fewer than three digits, etc.
	The overall algorithm is as described in 14.20 from HAC, but fixed to treat these cases.
*/
_private_int_div_school :: proc(quotient, remainder, numerator, denominator: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	error_if_immutable(quotient, remainder) or_return

	q, x, y, t1, t2 := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
	defer internal_destroy(q, x, y, t1, t2)

	internal_grow(q, numerator.used + 2) or_return
	q.used = numerator.used + 2

	internal_init_multi(t1, t2)   or_return
	internal_copy(x, numerator)   or_return
	internal_copy(y, denominator) or_return

	/*
		Fix the sign.
	*/
	neg := numerator.sign != denominator.sign
	x.sign = .Zero_or_Positive
	y.sign = .Zero_or_Positive

	/*
		Normalize both x and y, ensure that y >= b/2, [b == 2**_DIGIT_BITS]
	*/
	norm := internal_count_bits(y) % _DIGIT_BITS
	if norm < _DIGIT_BITS - 1 {
		norm = (_DIGIT_BITS - 1) - norm
		internal_shl(x, x, norm) or_return
		internal_shl(y, y, norm) or_return
	} else {
		norm = 0
	}

	/*
		Note: HAC does 0 based, so if used == 5 then it's 0,1,2,3,4, i.e. use 4.
	*/
	n := x.used - 1
	t := y.used - 1

	/*
		while (x >= y*b**{n-t}) do { q[n-t] += 1; x -= y*b**{n-t} }
		y = y*b**{n-t}
	*/
	_private_int_shl_leg(y, n - t) or_return

	gte := internal_gte(x, y)
	for gte {
		q.digit[n - t] += 1
		internal_sub(x, x, y) or_return
		gte = internal_gte(x, y)
	}

	/*
		Reset y by shifting it back down.
	*/
	_private_int_shr_leg(y, n - t)

	/*
		Step 3. for i from n down to (t + 1).
	*/
	#no_bounds_check for i := n; i >= (t + 1); i -= 1 {
		if i > x.used { continue }

		/*
			Step 3.1: if xi == yt then set q{i-t-1} to b-1, otherwise set q{i-t-1} to (xi*b + x{i-1})/yt
		*/
		if x.digit[i] == y.digit[t] {
			q.digit[(i - t) - 1] = _MASK /* b - 1 */
		} else {
			tmp := _WORD(x.digit[i]) << _DIGIT_BITS
			tmp |= _WORD(x.digit[i - 1])
			tmp /= _WORD(y.digit[t])
			if tmp > _WORD(_MASK) {
				tmp = _WORD(_MASK)
			}
			q.digit[(i - t) - 1] = DIGIT(tmp & _WORD(_MASK))
		}

		/*
			while (q{i-t-1} * (yt * b + y{t-1})) > xi * b**2 + x{i-1} * b + x{i-2}
				do q{i-t-1} -= 1;
		*/
		iter := 0

		q.digit[(i - t) - 1] = (q.digit[(i - t) - 1] + 1) & _MASK
		#no_bounds_check for {
			q.digit[(i - t) - 1] = (q.digit[(i - t) - 1] - 1) & _MASK

			/*
				Find left hand.
			*/
			internal_zero(t1)
			t1.digit[0] = ((t - 1) < 0) ? 0 : y.digit[t - 1]
			t1.digit[1] = y.digit[t]
			t1.used = 2
			internal_mul(t1, t1, q.digit[(i - t) - 1]) or_return

			/*
				Find right hand.
			*/
			t2.digit[0] = ((i - 2) < 0) ? 0 : x.digit[i - 2]
			t2.digit[1] = x.digit[i - 1] /* i >= 1 always holds */
			t2.digit[2] = x.digit[i]
			t2.used = 3

			if internal_lte(t1, t2) {
				break
			}
			iter += 1; if iter > 100 {
				return .Max_Iterations_Reached
			}
		}

		/*
			Step 3.3: x = x - q{i-t-1} * y * b**{i-t-1}
		*/
		int_mul_digit(t1, y, q.digit[(i - t) - 1]) or_return
		_private_int_shl_leg(t1, (i - t) - 1)      or_return
		internal_sub(x, x, t1)                     or_return

		/*
			if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; }
		*/
		if x.sign == .Negative {
			internal_copy(t1, y)                  or_return
			_private_int_shl_leg(t1, (i - t) - 1) or_return
			internal_add(x, x, t1)                or_return
			q.digit[(i - t) - 1] = (q.digit[(i - t) - 1] - 1) & _MASK
		}
	}

	/*
		Now q is the quotient and x is the remainder, [which we have to normalize].
		Get sign before writing to c.
	*/
	z, _ := is_zero(x)
	x.sign = .Zero_or_Positive if z else numerator.sign

	if quotient != nil {
		internal_clamp(q)
		internal_swap(q, quotient)
		quotient.sign = .Negative if neg else .Zero_or_Positive
	}
	if remainder != nil {
		internal_shr(x, x, norm) or_return
		internal_swap(x, remainder)
	}
	return nil
}
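/*
	Quotient-digit estimation in base 10 (illustrative): dividing 765 by 89,
	the divisor's top digit 8 satisfies the normalization bound y_t >= b/2 = 5.
	The estimate min(floor(76 / 8), b - 1) = 9 is then corrected by the loop:
	9 * 89 = 801 > 765, so it decrements once to 8, and 765 = 8 * 89 + 53.
	Normalization guarantees the estimate is at most 2 too large, so the
	correction loop terminates quickly; the `iter > 100` guard is belt and
	suspenders.
*/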
/*
	Direct implementation of algorithms 1.8 "RecursiveDivRem" and 1.9 "UnbalancedDivision" from:
		Brent, Richard P., and Paul Zimmermann. "Modern Computer Arithmetic."
		Vol. 18. Cambridge University Press, 2010.
		Available online at https://arxiv.org/pdf/1004.4710, pages 19ff.
*/
_private_div_recursion :: proc(quotient, remainder, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	A1, A2, B1, B0, Q1, Q0, R1, R0, t := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
	defer internal_destroy(A1, A2, B1, B0, Q1, Q0, R1, R0, t)

	m := a.used - b.used
	k := m / 2

	if m < MUL_KARATSUBA_CUTOFF {
		return _private_int_div_school(quotient, remainder, a, b)
	}

	internal_init_multi(A1, A2, B1, B0, Q1, Q0, R1, R0, t) or_return

	/*
		`B1` = `b` / `beta`^`k`, `B0` = `b` % `beta`^`k`
	*/
	internal_shrmod(B1, B0, b, k * _DIGIT_BITS) or_return

	/*
		(Q1, R1) = RecursiveDivRem(A / beta^(2k), B1)
	*/
	internal_shrmod(A1, t, a, 2 * k * _DIGIT_BITS) or_return
	_private_div_recursion(Q1, R1, A1, B1)         or_return

	/*
		A1 = (R1 * beta^(2k)) + (A % beta^(2k)) - (Q1 * B0 * beta^k)
	*/
	_private_int_shl_leg(R1, 2 * k) or_return
	internal_add(A1, R1, t)         or_return
	internal_mul(t, Q1, B0)         or_return
	_private_int_shl_leg(t, k)      or_return
	internal_sub(A1, A1, t)         or_return

	/*
		While A1 < 0 do Q1 = Q1 - 1, A1 = A1 + (beta^k * B)
	*/
	if internal_lt(A1, 0) {
		internal_shl(t, b, k * _DIGIT_BITS) or_return
		for {
			internal_decr(Q1)       or_return
			internal_add(A1, A1, t) or_return
			if internal_gte(A1, 0) { break }
		}
	}

	/*
		(Q0, R0) = RecursiveDivRem(A1 / beta^(k), B1)
	*/
	internal_shrmod(A1, t, A1, k * _DIGIT_BITS) or_return
	_private_div_recursion(Q0, R0, A1, B1)      or_return

	/*
		A2 = (R0 * beta^k) + (A1 % beta^k) - (Q0 * B0)
	*/
	_private_int_shl_leg(R0, k) or_return
	internal_add(A2, R0, t)     or_return
	internal_mul(t, Q0, B0)     or_return
	internal_sub(A2, A2, t)     or_return

	/*
		While A2 < 0 do Q0 = Q0 - 1, A2 = A2 + B.
	*/
	for internal_is_negative(A2) {
		internal_decr(Q0)       or_return
		internal_add(A2, A2, b) or_return
	}

	/*
		Return q = (Q1 * beta^k) + Q0, r = A2.
	*/
	_private_int_shl_leg(Q1, k)    or_return
	internal_add(quotient, Q1, Q0) or_return
	return internal_copy(remainder, A2)
}
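/*
	Shape of the recursion above: with m = a.used - b.used and k = m / 2, the
	divisor is split as B1 * beta^k + B0. The top of the dividend is divided by
	B1 to get a first quotient chunk Q1, the error from having ignored B0 is
	subtracted back out (with at most a few corrective add-backs), and the same
	is repeated one level down for Q0. Once m drops below MUL_KARATSUBA_CUTOFF
	the schoolbook division is faster, so the recursion bottoms out there.
*/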
_private_int_div_recursive :: proc(quotient, remainder, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	A, B, Q, Q1, R, A_div, A_mod := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
	defer internal_destroy(A, B, Q, Q1, R, A_div, A_mod)
	internal_init_multi(A, B, Q, Q1, R, A_div, A_mod) or_return

	/*
		Most significant bit of a limb.
		Assumes `_DIGIT_BITS < size_of(DIGIT) * 8`, so `_DIGIT_MAX + 1` doesn't overflow a DIGIT.
	*/
	msb := (_DIGIT_MAX + DIGIT(1)) >> 1

	sigma := 0
	msb_b := b.digit[b.used - 1]
	for msb_b < msb {
		sigma += 1
		msb_b <<= 1
	}

	/*
		Use that sigma to normalize B.
	*/
	internal_shl(B, b, sigma) or_return
	internal_shl(A, a, sigma) or_return

	/*
		Fix the sign.
	*/
	neg := a.sign != b.sign
	A.sign = .Zero_or_Positive; B.sign = .Zero_or_Positive

	/*
		If the magnitude of "A" is no more than twice that of "B" we can work
		on them directly, otherwise we need to work at "A" in chunks.
	*/
	n := B.used
	m := A.used - B.used

	/*
		Q = 0. We already ensured that when we called `internal_init_multi`.
	*/
	for m > n {
		/*
			(q, r) = RecursiveDivRem(A / (beta^(m-n)), B)
		*/
		j := (m - n) * _DIGIT_BITS
		internal_shrmod(A_div, A_mod, A, j)     or_return
		_private_div_recursion(Q1, R, A_div, B) or_return

		/*
			Q = (Q * beta^(n)) + q
		*/
		internal_shl(Q, Q, n * _DIGIT_BITS) or_return
		internal_add(Q, Q, Q1)              or_return

		/*
			A = (r * beta^(m-n)) + (A % beta^(m-n))
		*/
		internal_shl(R, R, (m - n) * _DIGIT_BITS) or_return
		internal_add(A, R, A_mod)                 or_return

		/*
			m = m - n
		*/
		m -= n
	}

	/*
		(q, r) = RecursiveDivRem(A, B)
	*/
	_private_div_recursion(Q1, R, A, B) or_return

	/*
		Q = (Q * beta^m) + q, R = r
	*/
	internal_shl(Q, Q, m * _DIGIT_BITS) or_return
	internal_add(Q, Q, Q1)              or_return

	/*
		Get sign before writing to dest.
	*/
	R.sign = .Zero_or_Positive if internal_is_zero(Q) else a.sign

	if quotient != nil {
		swap(quotient, Q)
		quotient.sign = .Negative if neg else .Zero_or_Positive
	}
	if remainder != nil {
		/*
			De-normalize the remainder.
		*/
		internal_shrmod(R, nil, R, sigma) or_return
		swap(remainder, R)
	}
	return nil
}
/*
	Slower bit-bang division... also smaller.
*/
@(deprecated="Use `_private_int_div_school`, it's 3.5x faster.")
_private_int_div_small :: proc(quotient, remainder, numerator, denominator: ^Int) -> (err: Error) {
	ta, tb, tq, q := &Int{}, &Int{}, &Int{}, &Int{}
	defer internal_destroy(ta, tb, tq, q)

	for {
		internal_one(tq) or_return

		num_bits, _ := count_bits(numerator)
		den_bits, _ := count_bits(denominator)
		n := num_bits - den_bits

		abs(ta, numerator)   or_return
		abs(tb, denominator) or_return
		shl(tb, tb, n)       or_return
		shl(tq, tq, n)       or_return

		for n >= 0 {
			if internal_gte(ta, tb) {
				// ta -= tb
				sub(ta, ta, tb) or_return
				// q  += tq
				add(q, q, tq)   or_return
			}
			shr1(tb, tb) or_return
			shr1(tq, tq) or_return
			n -= 1
		}

		/*
			Now q == quotient and ta == remainder.
		*/
		neg := numerator.sign != denominator.sign
		if quotient != nil {
			swap(quotient, q)
			z, _ := is_zero(quotient)
			quotient.sign = .Negative if neg && !z else .Zero_or_Positive
		}
		if remainder != nil {
			swap(remainder, ta)
			z, _ := is_zero(numerator)
			remainder.sign = .Zero_or_Positive if z else numerator.sign
		}
		break
	}
	return err
}
/*
	Binary split factorial algo due to: http://www.luschny.de/math/factorial/binarysplitfact.html
*/
_private_int_factorial_binary_split :: proc(res: ^Int, n: int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	inner, outer, start, stop, temp := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
	defer internal_destroy(inner, outer, start, stop, temp)

	internal_one(inner, false) or_return
	internal_one(outer, false) or_return

	bits_used := ilog2(n)

	for i := bits_used; i >= 0; i -= 1 {
		start := (n >> (uint(i) + 1)) + 1 | 1
		stop  := (n >>  uint(i))      + 1 | 1
		_private_int_recursive_product(temp, start, stop, 0) or_return
		internal_mul(inner, inner, temp)  or_return
		internal_mul(outer, outer, inner) or_return
	}
	shift := n - intrinsics.count_ones(n)
	return internal_shl(res, outer, int(shift))
}
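/*
	Worked example for n = 10 (illustrative): ilog2(10) = 3, and the (start, stop)
	ranges yield the odd-part products

		i = 1: 3 * 5 = 15
		i = 0: 7 * 9 = 63

	(the i = 3 and i = 2 ranges are empty), so inner = 15 * 63 = 945 and
	outer = 15 * 945 = 14175. The final shift is n - count_ones(n) = 10 - 2 = 8,
	and indeed 14175 << 8 = 3628800 = 10!.
*/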
/*
	Recursive product used by the binary split factorial algorithm.
*/
_private_int_recursive_product :: proc(res: ^Int, start, stop: int, level := int(0), allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	t1, t2 := &Int{}, &Int{}
	defer internal_destroy(t1, t2)

	if level > FACTORIAL_BINARY_SPLIT_MAX_RECURSIONS {
		return .Max_Iterations_Reached
	}

	num_factors := (stop - start) >> 1
	if num_factors == 2 {
		internal_set(t1, start, false) or_return
		when true {
			internal_grow(t2, t1.used + 1, false) or_return
			internal_add(t2, t1, 2)               or_return
		} else {
			internal_add(t2, t1, 2) or_return
		}
		return internal_mul(res, t1, t2)
	}

	if num_factors > 1 {
		mid := (start + num_factors) | 1
		_private_int_recursive_product(t1, start, mid,  level + 1) or_return
		_private_int_recursive_product(t2, mid,   stop, level + 1) or_return
		return internal_mul(res, t1, t2)
	}

	if num_factors == 1 {
		return #force_inline internal_set(res, start, true)
	}

	return #force_inline internal_one(res, true)
}
/*
	Internal function computing both the GCD using the binary method,
	and, if the target isn't `nil`, also the LCM.

	Expects `a` and `b` to have been initialized
	and one or both of `res_gcd` or `res_lcm` not to be `nil`.

	If both `a` and `b` are zero, the results are zero.
	If either `a` or `b` is zero, the GCD is the other one.

	The `gcd` and `lcm` wrappers have already done this test,
	but `gcd_lcm` wouldn't have, so we still need to perform it.

	If neither result is wanted, we have nothing to do.
*/
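/*
	Worked example of the binary GCD below (illustrative): a = 12, b = 18.
	u_lsb = 2, v_lsb = 1, so k = 1 and both are shifted right once (u = 6, v = 9),
	after which the leftover factor of two is divided out of u (u = 3).
	The subtraction loop then runs: v = 9 - 3 = 6, shift out twos -> v = 3;
	v = 3 - 3 = 0, ending the loop. gcd = u << k = 3 << 1 = 6.
	For the LCM, since |a| < |b|: lcm = (a / gcd) * b = (12 / 6) * 18 = 36.
*/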
_private_int_gcd_lcm :: proc(res_gcd, res_lcm, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	if res_gcd == nil && res_lcm == nil {
		return nil
	}

	if a.used == 0 && b.used == 0 {
		/*
			GCD(0, 0) and LCM(0, 0) are both 0.
		*/
		if res_gcd != nil {
			internal_zero(res_gcd) or_return
		}
		if res_lcm != nil {
			internal_zero(res_lcm) or_return
		}
		return nil
	} else if a.used == 0 {
		/*
			We can early out with GCD = B and LCM = 0.
		*/
		if res_gcd != nil {
			internal_abs(res_gcd, b) or_return
		}
		if res_lcm != nil {
			internal_zero(res_lcm) or_return
		}
		return nil
	} else if b.used == 0 {
		/*
			We can early out with GCD = A and LCM = 0.
		*/
		if res_gcd != nil {
			internal_abs(res_gcd, a) or_return
		}
		if res_lcm != nil {
			internal_zero(res_lcm) or_return
		}
		return nil
	}

	/*
		We need a temporary because `res_gcd` is allowed to be `nil`.
	*/
	temp_gcd_res := &Int{}
	defer internal_destroy(temp_gcd_res)

	/*
		Neither `a` nor `b` was zero, so we need to compute the GCD.
		Get copies of `a` and `b` we can modify.
	*/
	u, v := &Int{}, &Int{}
	defer internal_destroy(u, v)
	internal_copy(u, a) or_return
	internal_copy(v, b) or_return

	/*
		Must be positive for the remainder of the algorithm.
	*/
	u.sign = .Zero_or_Positive; v.sign = .Zero_or_Positive

	/*
		B1. Find the common power of two for `u` and `v`.
	*/
	u_lsb, _ := internal_count_lsb(u)
	v_lsb, _ := internal_count_lsb(v)
	k := min(u_lsb, v_lsb)

	if k > 0 {
		/*
			Divide the power of two out.
		*/
		internal_shr(u, u, k) or_return
		internal_shr(v, v, k) or_return
	}

	/*
		Divide any remaining factors of two out.
	*/
	if u_lsb != k {
		internal_shr(u, u, u_lsb - k) or_return
	}
	if v_lsb != k {
		internal_shr(v, v, v_lsb - k) or_return
	}

	for v.used != 0 {
		/*
			Swap `u` and `v` if needed, to make sure `v` >= `u`.
		*/
		if internal_gt(u, v) {
			internal_swap(u, v)
		}
		/*
			Subtract smallest from largest.
		*/
		internal_sub(v, v, u) or_return
		/*
			Divide out all factors of two.
		*/
		b, _ := internal_count_lsb(v)
		internal_shr(v, v, b) or_return
	}

	/*
		Multiply by 2**k which we divided out at the beginning.
	*/
	internal_shl(temp_gcd_res, u, k) or_return
	temp_gcd_res.sign = .Zero_or_Positive

	/*
		We've computed the GCD, either the long way, or because one of the inputs was zero.
		If we don't want the LCM, we're done.
	*/
	if res_lcm == nil {
		internal_swap(temp_gcd_res, res_gcd)
		return nil
	}

	/*
		Compute the least common multiple as `|a*b| / gcd(a, b)`,
		dividing the smaller of `a`, `b` by the GCD first.
	*/
	if internal_lt_abs(a, b) {
		/*
			Store the quotient in `res_lcm`, such that `res_lcm * b` is the LCM.
		*/
		internal_div(res_lcm, a, temp_gcd_res) or_return
		err = internal_mul(res_lcm, res_lcm, b)
	} else {
		/*
			Store the quotient in `res_lcm`, such that `res_lcm * a` is the LCM.
		*/
		internal_div(res_lcm, b, temp_gcd_res) or_return
		err = internal_mul(res_lcm, res_lcm, a)
	}

	if res_gcd != nil {
		internal_swap(temp_gcd_res, res_gcd)
	}

	/*
		Fix the sign to positive and return.
	*/
	res_lcm.sign = .Zero_or_Positive
	return err
}
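/*
	Worked example (a sketch): gcd_lcm(12, 18).
	u = 12, v = 18; the common power of two is k = min(2, 1) = 1, so u = 6, v = 9,
	and u still has a spare factor of two: u = 3.
	Loop: v = 9 - 3 = 6, strip twos -> v = 3; then v = 3 - 3 = 0 ends the loop.
	GCD = u << k = 3 << 1 = 6, and LCM = (12 / 6) * 18 = 36.
*/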
/*
	Internal implementation of log.
	Assumes `a` not to be `nil` and to have been initialized.
*/
_private_int_log :: proc(a: ^Int, base: DIGIT, allocator := context.allocator) -> (res: int, err: Error) {
	bracket_low, bracket_high, bracket_mid, t, bi_base := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
	defer internal_destroy(bracket_low, bracket_high, bracket_mid, t, bi_base)

	ic := #force_inline internal_cmp(a, base)
	if ic == -1 || ic == 0 {
		return 1 if ic == 0 else 0, nil
	}

	defer if err != nil {
		res = -1
	}

	internal_set(bi_base, base, true, allocator) or_return
	internal_clear(bracket_mid, false, allocator) or_return
	internal_clear(t, false, allocator) or_return
	internal_one(bracket_low, false, allocator) or_return
	internal_set(bracket_high, base, false, allocator) or_return

	low := 0; high := 1

	/*
		A kind of Giant-step/baby-step algorithm.
		Idea shamelessly stolen from https://programmingpraxis.com/2010/05/07/integer-logarithms/2/
		The speedup is asymptotic, so benchmarks are needed to test whether
		the Giant-step should be skipped for small `n`.
	*/
	for {
		/*
			Iterate until `a` is bracketed between `bracket_low` and `bracket_high`.
		*/
		if #force_inline internal_gte(bracket_high, a) { break }

		low = high
		#force_inline internal_copy(bracket_low, bracket_high) or_return
		high <<= 1
		#force_inline internal_sqr(bracket_high, bracket_high) or_return
	}

	for (high - low) > 1 {
		mid := (high + low) >> 1
		#force_inline internal_pow(t, bi_base, mid - low) or_return
		#force_inline internal_mul(bracket_mid, bracket_low, t) or_return

		mc := #force_inline internal_cmp(a, bracket_mid)
		switch mc {
		case -1:
			high = mid
			internal_swap(bracket_mid, bracket_high)
		case 0:
			return mid, nil
		case 1:
			low = mid
			internal_swap(bracket_mid, bracket_low)
		}
	}

	fc := #force_inline internal_cmp(bracket_high, a)
	res = high if fc == 0 else low
	return
}
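/*
	Worked example (a sketch): _private_int_log(a = 100, base = 3).
	The giant steps square the upper bracket: 3, 9, 81, 6561, leaving low = 4, high = 8
	with bracket_low = 3**4 = 81.
	The binary search then tries mid = 6: 81 * 3**2 = 729 > 100, so high = 6;
	and mid = 5: 81 * 3 = 243 > 100, so high = 5.
	Now high - low == 1 and bracket_high != a, so the result is low = 4,
	which is correct: 3**4 = 81 <= 100 < 3**5 = 243.
*/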
/*
	Computes xR**-1 == x (mod N) via Montgomery Reduction.
	This is an optimized implementation of `internal_montgomery_reduce`
	which uses the comba method to quickly calculate the columns of the reduction.
	Based on Algorithm 14.32 on pp.601 of HAC.
*/
_private_montgomery_reduce_comba :: proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	W: [_WARRAY]_WORD = ---

	if x.used > _WARRAY { return .Invalid_Argument }

	/*
		Get old used count.
	*/
	old_used := x.used

	/*
		Grow `x` as required.
	*/
	internal_grow(x, n.used + 1) or_return

	/*
		First we have to get the digits of the input into an array of double precision words W[...].
		Copy the digits of `x` into W[0 .. x.used - 1].
	*/
	ix: int
	for ix = 0; ix < x.used; ix += 1 {
		W[ix] = _WORD(x.digit[ix])
	}

	/*
		Zero the high words W[x.used .. n.used * 2].
	*/
	zero_upper := (n.used * 2) + 1
	if ix < zero_upper {
		for ix = x.used; ix < zero_upper; ix += 1 {
			W[ix] = {}
		}
	}

	/*
		Now we proceed to zero successive digits from the least significant upwards.
	*/
	for ix = 0; ix < n.used; ix += 1 {
		/*
			`mu = ai * m' mod b`
			We avoid a double precision multiplication (which isn't required)
			by casting the value down to a DIGIT. Note this requires
			that W[ix-1] have the carry cleared (see after the inner loop).
		*/
		mu := ((W[ix] & _WORD(_MASK)) * _WORD(rho)) & _WORD(_MASK)

		/*
			`a = a + mu * m * b**i`
			This is computed in place and on the fly. The multiplication
			by b**i is handled by offsetting which columns the results
			are added to.

			Note the comba method normally doesn't handle carries in the
			inner loop. In this case we fix the carry from the previous
			column since the Montgomery reduction requires digits of the
			result (so far) [see above] to work.

			This is handled by fixing up one carry after the inner loop.
			The carry fixups are done in order, so after these loops the
			first `n.used` words of W[] have the carries fixed.
		*/
		for iy := 0; iy < n.used; iy += 1 {
			W[ix + iy] += mu * _WORD(n.digit[iy])
		}

		/*
			Now fix the carry for the next digit, W[ix+1].
		*/
		W[ix + 1] += (W[ix] >> _DIGIT_BITS)
	}

	/*
		Now we have to propagate the carries and shift the words downward
		[all those least significant digits we zeroed].
	*/
	for ; ix < n.used * 2; ix += 1 {
		W[ix + 1] += (W[ix] >> _DIGIT_BITS)
	}

	/*
		Copy out, A = A/b**n.

		The result is A/b**n, but instead of converting from an array of
		_WORD to DIGIT and then calling `_private_int_shr_leg`,
		we just copy them in the right order.
	*/
	for ix = 0; ix < (n.used + 1); ix += 1 {
		x.digit[ix] = DIGIT(W[n.used + ix] & _WORD(_MASK))
	}

	/*
		Set the max used.
	*/
	x.used = n.used + 1

	/*
		Zero `old_used` digits; if the input was larger than `n.used + 1`, we'll have to clear the top digits.
	*/
	internal_zero_unused(x, old_used)
	internal_clamp(x)

	/*
		if A >= m then A = A - m
	*/
	if internal_gte_abs(x, n) {
		return internal_sub(x, x, n)
	}
	return nil
}
/*
	Computes xR**-1 == x (mod N) via Montgomery Reduction.
	Assumes `x` and `n` not to be `nil`.
*/
_private_int_montgomery_reduce :: proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	/*
		Can the fast reduction [comba] method be used?
		Note that unlike in `mul`, you're safely allowed *fewer* than the available columns [255 by default],
		since carries are fixed up in the inner loop.
	*/
	internal_clear_if_uninitialized(x, n) or_return

	digs := (n.used * 2) + 1
	if digs < _WARRAY && x.used <= _WARRAY && n.used < _MAX_COMBA {
		return _private_montgomery_reduce_comba(x, n, rho)
	}

	/*
		Grow the input as required.
	*/
	internal_grow(x, digs) or_return
	x.used = digs

	for ix := 0; ix < n.used; ix += 1 {
		/*
			`mu = ai * rho mod b`
			The value of `rho` must be precalculated via `int_montgomery_setup()`,
			such that it equals -1/n0 mod b. This allows the following inner loop
			to reduce the input one digit at a time.
		*/
		mu := DIGIT((_WORD(x.digit[ix]) * _WORD(rho)) & _WORD(_MASK))

		/*
			a = a + mu * m * b**i
			Multiply and add in place.
		*/
		u := DIGIT(0)
		iy := int(0)
		for ; iy < n.used; iy += 1 {
			/*
				Compute product and sum.
			*/
			r := (_WORD(mu) * _WORD(n.digit[iy]) + _WORD(u) + _WORD(x.digit[ix + iy]))

			/*
				Get carry.
			*/
			u = DIGIT(r >> _DIGIT_BITS)

			/*
				Fix digit.
			*/
			x.digit[ix + iy] = DIGIT(r & _WORD(_MASK))
		}
		/*
			At this point the ix'th digit of x should be zero.
			Propagate carries upwards as required.
		*/
		for u != 0 {
			x.digit[ix + iy] += u
			u = x.digit[ix + iy] >> _DIGIT_BITS
			x.digit[ix + iy] &= _MASK
			iy += 1
		}
	}

	/*
		At this point the `n.used` least significant digits of `x` are all zero,
		which means we can shift `x` to the right by `n.used` digits and the
		residue is unchanged.

		x = x/b**n.used
	*/
	internal_clamp(x)
	_private_int_shr_leg(x, n.used)

	/*
		if x >= n then x = x - n
	*/
	if internal_gte_abs(x, n) {
		return internal_sub(x, x, n)
	}
	return nil
}
/*
	Shifts with subtractions when the result is greater than `b`.
	The method is slightly modified to shift `B` unconditionally up to just under
	the leading bit of `b`. This saves a lot of multiple precision shifting.
	Assumes `a` and `b` not to be `nil`.
*/
_private_int_montgomery_calc_normalization :: proc(a, b: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	/*
		How many bits of the last digit does `b` use?
	*/
	internal_clear_if_uninitialized(a, b) or_return

	bits := internal_count_bits(b) % _DIGIT_BITS
	if b.used > 1 {
		power := ((b.used - 1) * _DIGIT_BITS) + bits - 1
		internal_int_power_of_two(a, power) or_return
	} else {
		internal_one(a) or_return
		bits = 1
	}

	/*
		Now compute C = A * B mod b.
	*/
	for x := bits - 1; x < _DIGIT_BITS; x += 1 {
		internal_int_shl1(a, a) or_return
		if internal_gte_abs(a, b) {
			internal_sub(a, a, b) or_return
		}
	}
	return nil
}
/*
	Sets up the Montgomery reduction stuff.
*/
_private_int_montgomery_setup :: proc(n: ^Int, allocator := context.allocator) -> (rho: DIGIT, err: Error) {
	/*
		Fast inversion mod 2**k, based on the fact that:

		XA = 1 (mod 2**n)  =>  (X(2-XA)) A = 1 (mod 2**2n)
		                   =>  2*X*A - X*X*A*A    = 1
		                   =>  2*(1)   - (1)      = 1
	*/
	internal_clear_if_uninitialized(n, allocator) or_return

	b := n.digit[0]
	if b & 1 == 0 { return 0, .Invalid_Argument }

	x := (((b + 2) & 4) << 1) + b /* here x*a==1 mod 2**4  */
	x *= 2 - (b * x)              /* here x*a==1 mod 2**8  */
	x *= 2 - (b * x)              /* here x*a==1 mod 2**16 */

	when _DIGIT_TYPE_BITS == 64 {
		x *= 2 - (b * x) /* here x*a==1 mod 2**32 */
		x *= 2 - (b * x) /* here x*a==1 mod 2**64 */
	}

	/*
		rho = -1/m mod b
	*/
	rho = DIGIT(((_WORD(1) << _WORD(_DIGIT_BITS)) - _WORD(x)) & _WORD(_MASK))
	return rho, nil
}
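/*
	Worked example (a sketch, not part of the library) of the inversion above, with b = n.digit[0] = 7:

	x = (((7 + 2) & 4) << 1) + 7 = 7, and 7 * 7 = 49 == 1 (mod 2**4).
	One Newton step, x *= 2 - b * x, gives x == 183 (mod 2**8), and 7 * 183 = 1281 == 1 (mod 2**8).
	Each step doubles the number of correct low bits until x == 1/n0 (mod 2**_DIGIT_BITS);
	`rho` is then its negation modulo the digit base.

	Typical call sequence, using procedures from this file:

	rho := _private_int_montgomery_setup(n) or_return
	// ... arithmetic on values in the Montgomery domain ...
	_private_int_montgomery_reduce(x, n, rho) or_return // x = x * R**-1 (mod n)
*/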
/*
	Reduces `x` mod `m`, assumes 0 < x < m**2, `mu` is precomputed via `_private_int_reduce_setup`.
	From HAC pp.604 Algorithm 14.42.
	Assumes `x`, `m` and `mu` all not to be `nil` and to have been initialized.
*/
_private_int_reduce :: proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	q := &Int{}
	defer internal_destroy(q)

	um := m.used

	/*
		q = x
	*/
	internal_copy(q, x) or_return

	/*
		q1 = x / b**(k-1)
	*/
	_private_int_shr_leg(q, um - 1)

	/*
		According to HAC this optimization is ok.
	*/
	if DIGIT(um) > DIGIT(1) << (_DIGIT_BITS - 1) {
		internal_mul(q, q, mu) or_return
	} else {
		_private_int_mul_high(q, q, mu, um) or_return
	}

	/*
		q3 = q2 / b**(k+1)
	*/
	_private_int_shr_leg(q, um + 1)

	/*
		x = x mod b**(k+1), quick (no division)
	*/
	internal_int_mod_bits(x, x, _DIGIT_BITS * (um + 1)) or_return

	/*
		q = q * m mod b**(k+1), quick (no division)
	*/
	_private_int_mul(q, q, m, um + 1) or_return

	/*
		x = x - q
	*/
	internal_sub(x, x, q) or_return

	/*
		If x < 0, add b**(k+1) to it.
	*/
	if internal_is_negative(x) {
		internal_set(q, 1) or_return
		_private_int_shl_leg(q, um + 1) or_return
		internal_add(x, x, q) or_return
	}

	/*
		Back off if it's too big.
	*/
	for internal_gte(x, m) {
		internal_sub(x, x, m) or_return
	}
	return nil
}
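/*
	Worked example (a sketch) of the Barrett steps above, in base 10 for readability:
	m = 123 (k = 3 digits), mu = floor(10**6 / 123) = 8130, x = 9876.

	q1 = floor(x / 10**2)            = 98
	q3 = floor(98 * 8130 / 10**4)    = 79    <- the quotient estimate
	x  = (9876 - 79 * 123) mod 10**4 = 159

	159 >= 123, so one trailing subtraction gives 36, which is indeed 9876 mod 123.
*/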
/*
	Reduces `a` modulo `n`, where `n` is of the form 2**p - d.
*/
_private_int_reduce_2k :: proc(a, n: ^Int, d: DIGIT, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	q := &Int{}
	defer internal_destroy(q)

	internal_zero(q) or_return

	p := internal_count_bits(n)
	for {
		/*
			q = a/2**p, a = a mod 2**p
		*/
		internal_shrmod(q, a, a, p) or_return

		if d != 1 {
			/*
				q = q * d
			*/
			internal_mul(q, q, d) or_return
		}

		/*
			a = a + q
		*/
		internal_add(a, a, q) or_return
		if internal_lt_abs(a, n) { break }
		internal_sub(a, a, n) or_return
	}
	return nil
}
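/*
	Worked example (a sketch): n = 2**8 - 5 = 251, so p = 8 and d = 5.
	Because 2**8 == 5 (mod 251), a = q * 2**8 + r == q * 5 + r (mod 251).
	For a = 1000: q = 3, r = 232, so a becomes 232 + 3 * 5 = 247 < 251,
	and indeed 1000 mod 251 == 247.
*/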
/*
	Reduces `a` modulo `n`, where `n` is of the form 2**p - d.
	This differs from `_private_int_reduce_2k`, since `d` can be larger than a single digit.
*/
_private_int_reduce_2k_l :: proc(a, n, d: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	q := &Int{}
	defer internal_destroy(q)

	internal_zero(q) or_return

	p := internal_count_bits(n)
	for {
		/*
			q = a/2**p, a = a mod 2**p
		*/
		internal_shrmod(q, a, a, p) or_return

		/*
			q = q * d
		*/
		internal_mul(q, q, d) or_return

		/*
			a = a + q
		*/
		internal_add(a, a, q) or_return
		if internal_lt_abs(a, n) { break }
		internal_sub(a, a, n) or_return
	}
	return nil
}
/*
	Determines if `internal_int_reduce_2k` can be used.
	Assumes `a` not to be `nil` and to have been initialized.
*/
_private_int_reduce_is_2k :: proc(a: ^Int) -> (reducible: bool, err: Error) {
	assert_if_nil(a)

	if internal_is_zero(a) {
		return false, nil
	} else if a.used == 1 {
		return true, nil
	} else if a.used > 1 {
		iy := internal_count_bits(a)
		iw := 1
		iz := DIGIT(1)

		/*
			Test every bit from the second digit up; they must all be 1.
		*/
		for ix := _DIGIT_BITS; ix < iy; ix += 1 {
			if a.digit[iw] & iz == 0 {
				return false, nil
			}
			iz <<= 1
			if iz > _DIGIT_MAX {
				iw += 1
				iz = 1
			}
		}
		return true, nil
	} else {
		return true, nil
	}
}
/*
	Determines if `internal_int_reduce_2k_l` can be used.
	Assumes `a` not to be `nil` and to have been initialized.
*/
_private_int_reduce_is_2k_l :: proc(a: ^Int) -> (reducible: bool, err: Error) {
	assert_if_nil(a)

	if internal_int_is_zero(a) {
		return false, nil
	} else if a.used == 1 {
		return true, nil
	} else if a.used > 1 {
		/*
			If more than half of the digits are -1 (mod b), we're sold.
		*/
		ix := 0
		iy := 0

		for ; ix < a.used; ix += 1 {
			if a.digit[ix] == _DIGIT_MAX {
				iy += 1
			}
		}
		return iy >= (a.used / 2), nil
	} else {
		return false, nil
	}
}
/*
	Determines the setup value.
	Assumes `a` is not `nil`.
*/
_private_int_reduce_2k_setup :: proc(a: ^Int, allocator := context.allocator) -> (d: DIGIT, err: Error) {
	context.allocator = allocator

	tmp := &Int{}
	defer internal_destroy(tmp)

	internal_zero(tmp) or_return
	internal_int_power_of_two(tmp, internal_count_bits(a)) or_return
	internal_sub(tmp, tmp, a) or_return

	return tmp.digit[0], nil
}
/*
	Determines the setup value, `mu := (1 << P.bits) - P`.
	Assumes `mu` and `P` are not `nil`.
*/
_private_int_reduce_2k_setup_l :: proc(mu, P: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	tmp := &Int{}
	defer internal_destroy(tmp)

	internal_zero(tmp) or_return
	internal_int_power_of_two(tmp, internal_count_bits(P)) or_return
	internal_sub(mu, tmp, P) or_return
	return nil
}
/*
	Pre-calculates the value required for Barrett reduction.
	For a given modulus `P` it calculates the value required in `mu`.
	Assumes `mu` and `P` are not `nil`.
*/
_private_int_reduce_setup :: proc(mu, P: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	internal_int_power_of_two(mu, P.used * 2 * _DIGIT_BITS) or_return
	return internal_int_div(mu, mu, P)
}
/*
	Determines the setup value.
	Assumes `a` not to be `nil` and to have been initialized.
*/
_private_int_dr_setup :: proc(a: ^Int) -> (d: DIGIT) {
	/*
		The casts are required if _DIGIT_BITS is one less than
		the number of bits in a DIGIT [e.g. _DIGIT_BITS == 31].
	*/
	return DIGIT((1 << _DIGIT_BITS) - a.digit[0])
}
/*
	Determines if a number is a valid DR modulus.
	Assumes `a` not to be `nil` and to have been initialized.
*/
_private_dr_is_modulus :: proc(a: ^Int) -> (res: bool) {
	/*
		Must be at least two digits.
	*/
	if a.used < 2 { return false }

	/*
		Must be of the form b**k - a [a <= b], so all but the first digit must be equal to -1 (mod b).
	*/
	for ix := 1; ix < a.used; ix += 1 {
		if a.digit[ix] != _MASK {
			return false
		}
	}
	return true
}
/*
	Reduces `x` in place modulo `n`, using the Diminished Radix algorithm.

	Based on the algorithm from the paper:
		"Generating Efficient Primes for Discrete Log Cryptosystems"
		Chae Hoon Lim, Pil Joong Lee,
		POSTECH Information Research Laboratories

	The modulus must be of a special format [see manual].
	Has been modified to use algorithm 7.10 from the LTM book instead.

	Input `x` must be in the range 0 <= x <= (n-1)**2.
	Assumes `x` and `n` not to be `nil` and to have been initialized.
*/
_private_int_dr_reduce :: proc(x, n: ^Int, k: DIGIT, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	/*
		m = number of digits in the modulus.
	*/
	m := n.used

	/*
		Ensure that `x` has at least 2m digits.
	*/
	internal_grow(x, m + m) or_return

	/*
		Top of loop; this is where the code resumes if another reduction pass is required.
	*/
	for {
		i: int
		mu := DIGIT(0)

		/*
			Compute (x mod B**m) + k * [x / B**m] inline and in place.
		*/
		for i = 0; i < m; i += 1 {
			r := _WORD(x.digit[i + m]) * _WORD(k) + _WORD(x.digit[i] + mu)
			x.digit[i] = DIGIT(r & _WORD(_MASK))
			mu = DIGIT(r >> _WORD(_DIGIT_BITS))
		}

		/*
			Set final carry.
		*/
		x.digit[i] = mu

		/*
			Zero words above m.
		*/
		mem.zero_slice(x.digit[m + 1:][:x.used - m])

		/*
			Clamp, sub and return.
		*/
		internal_clamp(x) or_return

		/*
			If x >= n then subtract and reduce again.
			Each successive "recursion" makes the input smaller and smaller.
		*/
		if internal_lt_abs(x, n) { break }
		internal_sub(x, x, n) or_return
	}
	return nil
}
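/*
	Worked example (a sketch) of the diminished radix identity, in base 10 for readability:
	n = 10**2 - 3 = 97, so k = 3, and because 10**2 == 3 (mod 97):
	x == (x mod 10**2) + 3 * floor(x / 10**2) (mod 97).
	For x = 1234 this gives 34 + 3 * 12 = 70, which is indeed 1234 mod 97.
*/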
/*
	Computes res == G**X mod P.
	Assumes `res`, `G`, `X` and `P` not to be `nil` and for `G`, `X` and `P` to have been initialized.
*/
_private_int_exponent_mod :: proc(res, G, X, P: ^Int, redmode: int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	M := [_TAB_SIZE]Int{}
	winsize: uint

	/*
		Use a pointer to the reduction algorithm.
		This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.
	*/
	redux: #type proc(x, m, mu: ^Int, allocator := context.allocator) -> (err: Error)

	defer {
		internal_destroy(&M[1])
		for x := 1 << (winsize - 1); x < (1 << winsize); x += 1 {
			internal_destroy(&M[x])
		}
	}

	/*
		Find window size.
	*/
	x := internal_count_bits(X)
	switch {
	case x <= 7:
		winsize = 2
	case x <= 36:
		winsize = 3
	case x <= 140:
		winsize = 4
	case x <= 450:
		winsize = 5
	case x <= 1303:
		winsize = 6
	case x <= 3529:
		winsize = 7
	case:
		winsize = 8
	}
	winsize = min(_MAX_WIN_SIZE, winsize) if _MAX_WIN_SIZE > 0 else winsize

	/*
		Init M array.
		Init first cell.
	*/
	internal_zero(&M[1]) or_return

	/*
		Now init the second half of the array.
	*/
	for x = 1 << (winsize - 1); x < (1 << winsize); x += 1 {
		internal_zero(&M[x]) or_return
	}

	/*
		Create `mu`, used for Barrett reduction.
	*/
	mu := &Int{}
	defer internal_destroy(mu)
	internal_zero(mu) or_return

	if redmode == 0 {
		_private_int_reduce_setup(mu, P) or_return
		redux = _private_int_reduce
	} else {
		_private_int_reduce_2k_setup_l(mu, P) or_return
		redux = _private_int_reduce_2k_l
	}

	/*
		Create the M table.
		The M table contains powers of the base, e.g. M[x] = G**x mod P.
		The first half of the table is not computed, though, except for M[0] and M[1].
	*/
	internal_int_mod(&M[1], G, P) or_return

	/*
		Compute the value at M[1 << (winsize - 1)] by squaring M[1] (winsize - 1) times.
		TODO: This can probably be replaced by computing the power and using `pow` to raise to it
		instead of repeated squaring.
	*/
	slot := 1 << (winsize - 1)
	internal_copy(&M[slot], &M[1]) or_return

	for x = 0; x < int(winsize - 1); x += 1 {
		/*
			Square it.
		*/
		internal_sqr(&M[slot], &M[slot]) or_return

		/*
			Reduce modulo P.
		*/
		redux(&M[slot], P, mu) or_return
	}

	/*
		Create the upper table, that is M[x] = M[x-1] * M[1] (mod P)
		for x = (2**(winsize - 1) + 1) to (2**winsize - 1).
	*/
	for x = slot + 1; x < (1 << winsize); x += 1 {
		internal_mul(&M[x], &M[x - 1], &M[1]) or_return
		redux(&M[x], P, mu) or_return
	}

	/*
		Setup result.
	*/
	internal_one(res) or_return

	/*
		Set initial mode and bit count.
	*/
	mode := 0
	bitcnt := 1
	buf := DIGIT(0)
	digidx := X.used - 1
	bitcpy := uint(0)
	bitbuf := DIGIT(0)

	for {
		/*
			Grab next digit as required.
		*/
		bitcnt -= 1
		if bitcnt == 0 {
			/*
				If digidx == -1 we are out of digits.
			*/
			if digidx == -1 { break }

			/*
				Read next digit and reset the bitcnt.
			*/
			buf = X.digit[digidx]
			digidx -= 1
			bitcnt = _DIGIT_BITS
		}

		/*
			Grab the next msb from the exponent.
		*/
		y := buf >> (_DIGIT_BITS - 1) & 1
		buf <<= 1

		/*
			If the bit is zero and mode == 0 then we ignore it.
			These represent the leading zero bits before the first 1 bit
			in the exponent. Technically this opt is not required but it
			does lower the # of trivial squaring/reductions used.
		*/
		if mode == 0 && y == 0 {
			continue
		}

		/*
			If the bit is zero and mode == 1 then we square.
		*/
		if mode == 1 && y == 0 {
			internal_sqr(res, res) or_return
			redux(res, P, mu) or_return
			continue
		}

		/*
			Else we add it to the window.
		*/
		bitcpy += 1
		bitbuf |= (y << (winsize - bitcpy))
		mode = 2

		if bitcpy == winsize {
			/*
				Window is filled, so square as required and multiply.
				Square first.
			*/
			for x = 0; x < int(winsize); x += 1 {
				internal_sqr(res, res) or_return
				redux(res, P, mu) or_return
			}

			/*
				Then multiply.
			*/
			internal_mul(res, res, &M[bitbuf]) or_return
			redux(res, P, mu) or_return

			/*
				Empty window and reset.
			*/
			bitcpy = 0
			bitbuf = 0
			mode = 1
		}
	}

	/*
		If bits remain then square/multiply.
	*/
	if mode == 2 && bitcpy > 0 {
		/*
			Square, then multiply if the bit is set.
		*/
		for x = 0; x < int(bitcpy); x += 1 {
			internal_sqr(res, res) or_return
			redux(res, P, mu) or_return

			bitbuf <<= 1
			if (bitbuf & (1 << winsize)) != 0 {
				/*
					Then multiply.
				*/
				internal_mul(res, res, &M[1]) or_return
				redux(res, P, mu) or_return
			}
		}
	}
	return err
}
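/*
	Example (a sketch) of the sliding window above, with winsize = 2 and X = 45 = 0b10_11_01.
	The exponent bits are consumed msb-first as the windows 10 | 11 | 01, so `res` becomes:

	((G**0b10)**4 * G**0b11)**4 * G**0b01 = G**45

	with a `redux` call after every squaring and multiplication.
*/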
/*
	Computes Y == G**X mod P, HAC pp.616, Algorithm 14.85.
	Uses a left-to-right `k`-ary sliding window to compute the modular exponentiation.
	The value of `k` changes based on the size of the exponent.
	Uses Montgomery or Diminished Radix reduction [whichever is appropriate].
	Assumes `res`, `G`, `X` and `P` not to be `nil` and for `G`, `X` and `P` to have been initialized.
*/
_private_int_exponent_mod_fast :: proc(res, G, X, P: ^Int, redmode: int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	M := [_TAB_SIZE]Int{}
	winsize: uint

	/*
		Use a pointer to the reduction algorithm.
		This allows us to use one of many reduction algorithms without modding the guts of the code with if statements everywhere.
	*/
	redux: #type proc(x, n: ^Int, rho: DIGIT, allocator := context.allocator) -> (err: Error)

	defer {
		internal_destroy(&M[1])
		for x := 1 << (winsize - 1); x < (1 << winsize); x += 1 {
			internal_destroy(&M[x])
		}
	}

	/*
		Find window size.
	*/
	x := internal_count_bits(X)
	switch {
	case x <= 7:
		winsize = 2
	case x <= 36:
		winsize = 3
	case x <= 140:
		winsize = 4
	case x <= 450:
		winsize = 5
	case x <= 1303:
		winsize = 6
	case x <= 3529:
		winsize = 7
	case:
		winsize = 8
	}
	winsize = min(_MAX_WIN_SIZE, winsize) if _MAX_WIN_SIZE > 0 else winsize

	/*
		Init M array.
		Init first cell.
	*/
	cap := internal_int_allocated_cap(P)
	internal_grow(&M[1], cap) or_return

	/*
		Now init the second half of the array.
	*/
	for x = 1 << (winsize - 1); x < (1 << winsize); x += 1 {
		internal_grow(&M[x], cap) or_return
	}

	/*
		Determine and setup reduction code.
	*/
	rho: DIGIT

	if redmode == 0 {
		/*
			Now setup Montgomery.
		*/
		rho = _private_int_montgomery_setup(P) or_return

		/*
			Automatically pick the comba one if available (saves quite a few calls/ifs).
		*/
		if ((P.used * 2) + 1) < _WARRAY && P.used < _MAX_COMBA {
			redux = _private_montgomery_reduce_comba
		} else {
			/*
				Use the slower baseline Montgomery method.
			*/
			redux = _private_int_montgomery_reduce
		}
	} else if redmode == 1 {
		/*
			Setup DR reduction for moduli of the form B**k - b.
		*/
		rho = _private_int_dr_setup(P)
		redux = _private_int_dr_reduce
	} else {
		/*
			Setup DR reduction for moduli of the form 2**k - b.
		*/
		rho = _private_int_reduce_2k_setup(P) or_return
		redux = _private_int_reduce_2k
	}

	/*
		Setup result.
	*/
	internal_grow(res, cap) or_return

	/*
		Create the M table.
		The first half of the table is not computed, though, except for M[0] and M[1].
	*/
	if redmode == 0 {
		/*
			Now we need R mod m.
		*/
		_private_int_montgomery_calc_normalization(res, P) or_return

		/*
			Now set M[1] to G * R mod m.
		*/
		internal_mulmod(&M[1], G, res, P) or_return
	} else {
		internal_one(res) or_return
		internal_mod(&M[1], G, P) or_return
	}

	/*
		Compute the value at M[1 << (winsize - 1)] by squaring M[1] (winsize - 1) times.
	*/
	slot := 1 << (winsize - 1)
	internal_copy(&M[slot], &M[1]) or_return

	for x = 0; x < int(winsize - 1); x += 1 {
		internal_sqr(&M[slot], &M[slot]) or_return
		redux(&M[slot], P, rho) or_return
	}

	/*
		Create the upper table.
	*/
	for x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x += 1 {
		internal_mul(&M[x], &M[x - 1], &M[1]) or_return
		redux(&M[x], P, rho) or_return
	}

	/*
		Set initial mode and bit count.
	*/
	mode := 0
	bitcnt := 1
	buf := DIGIT(0)
	digidx := X.used - 1
	bitcpy := 0
	bitbuf := DIGIT(0)

	for {
		/*
			Grab next digit as required.
		*/
		bitcnt -= 1
		if bitcnt == 0 {
			/*
				If digidx == -1 we are out of digits, so break.
			*/
			if digidx == -1 { break }

			/*
				Read next digit and reset the bitcnt.
			*/
			buf = X.digit[digidx]
			digidx -= 1
			bitcnt = _DIGIT_BITS
		}

		/*
			Grab the next msb from the exponent.
		*/
		y := (buf >> (_DIGIT_BITS - 1)) & 1
		buf <<= 1

		/*
			If the bit is zero and mode == 0 then we ignore it.
			These represent the leading zero bits before the first 1 bit in the exponent.
			Technically this opt is not required but it does lower the # of trivial squaring/reductions used.
		*/
		if mode == 0 && y == 0 { continue }

		/*
			If the bit is zero and mode == 1 then we square.
		*/
		if mode == 1 && y == 0 {
			internal_sqr(res, res) or_return
			redux(res, P, rho) or_return
			continue
		}

		/*
			Else we add it to the window.
		*/
		bitcpy += 1
		bitbuf |= (y << (winsize - uint(bitcpy)))
		mode = 2

		if bitcpy == int(winsize) {
			/*
				Window is filled, so square as required and multiply.
				Square first.
			*/
			for x = 0; x < int(winsize); x += 1 {
				internal_sqr(res, res) or_return
				redux(res, P, rho) or_return
			}

			/*
				Then multiply.
			*/
			internal_mul(res, res, &M[bitbuf]) or_return
			redux(res, P, rho) or_return

			/*
				Empty window and reset.
			*/
			bitcpy = 0
			bitbuf = 0
			mode = 1
		}
	}

	/*
		If bits remain then square/multiply.
	*/
	if mode == 2 && bitcpy > 0 {
		/*
			Square, then multiply if the bit is set.
		*/
		for x = 0; x < bitcpy; x += 1 {
			internal_sqr(res, res) or_return
			redux(res, P, rho) or_return

			/*
				Get next bit of the window.
			*/
			bitbuf <<= 1
			if bitbuf & (1 << winsize) != 0 {
				/*
					Then multiply.
				*/
				internal_mul(res, res, &M[1]) or_return
				redux(res, P, rho) or_return
			}
		}
	}

	if redmode == 0 {
		/*
			Fixup result if Montgomery reduction is used.
			Recall that any value in a Montgomery system is actually multiplied by R mod n.
			So we have to reduce one more time to cancel out the factor of R.
		*/
		redux(res, P, rho) or_return
	}
	return nil
}
/*
	hac 14.61, pp608
*/
_private_inverse_modulo :: proc(dest, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	x, y, u, v, A, B, C, D := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
	defer internal_destroy(x, y, u, v, A, B, C, D)

	/*
		`b` cannot be negative.
	*/
	if b.sign == .Negative || internal_is_zero(b) {
		return .Invalid_Argument
	}

	/*
		Init temps.
	*/
	internal_init_multi(x, y, u, v, A, B, C, D) or_return

	/*
		`x` = `a` % `b`, `y` = `b`
	*/
	internal_mod(x, a, b) or_return
	internal_copy(y, b) or_return

	/*
		2. [modified] if `x`, `y` are both even then return an error!
	*/
	if internal_is_even(x) && internal_is_even(y) {
		return .Invalid_Argument
	}

	/*
		3. u=x, v=y, A=1, B=0, C=0, D=1
	*/
	internal_copy(u, x) or_return
	internal_copy(v, y) or_return
	internal_one(A) or_return
	internal_one(D) or_return

	for {
		/*
			4. while `u` is even do:
		*/
		for internal_is_even(u) {
			/*
				4.1 `u` = `u` / 2
			*/
			internal_int_shr1(u, u) or_return

			/*
				4.2 if `A` or `B` is odd then:
					`A` = (`A` + `y`) / 2, `B` = (`B` - `x`) / 2
			*/
			if internal_is_odd(A) || internal_is_odd(B) {
				internal_add(A, A, y) or_return
				internal_sub(B, B, x) or_return
			}

			/*
				`A` = `A` / 2, `B` = `B` / 2
			*/
			internal_int_shr1(A, A) or_return
			internal_int_shr1(B, B) or_return
		}

		/*
			5. while `v` is even do:
		*/
		for internal_is_even(v) {
			/*
				5.1 `v` = `v` / 2
			*/
			internal_int_shr1(v, v) or_return

			/*
				5.2 if `C` or `D` is odd then:
					`C` = (`C` + `y`) / 2, `D` = (`D` - `x`) / 2
			*/
			if internal_is_odd(C) || internal_is_odd(D) {
				internal_add(C, C, y) or_return
				internal_sub(D, D, x) or_return
			}

			/*
				`C` = `C` / 2, `D` = `D` / 2
			*/
			internal_int_shr1(C, C) or_return
			internal_int_shr1(D, D) or_return
		}

		/*
			6. if `u` >= `v` then:
		*/
		if internal_cmp(u, v) != -1 {
			/*
				`u` = `u` - `v`, `A` = `A` - `C`, `B` = `B` - `D`
			*/
			internal_sub(u, u, v) or_return
			internal_sub(A, A, C) or_return
			internal_sub(B, B, D) or_return
		} else {
			/*
				`v` = `v` - `u`, `C` = `C` - `A`, `D` = `D` - `B`
			*/
			internal_sub(v, v, u) or_return
			internal_sub(C, C, A) or_return
			internal_sub(D, D, B) or_return
		}

		/*
			If `u` is not zero, goto step 4.
		*/
		if internal_is_zero(u) {
			break
		}
	}

	/*
		Now `a` = `C`, `b` = `D`, `gcd` == `g`*`v`
	*/

	/*
		If `v` != `1` then there is no inverse.
	*/
	if !internal_eq(v, 1) {
		return .Invalid_Argument
	}

	/*
		If it's too low, add `b` until it isn't.
	*/
	for internal_is_negative(C) {
		internal_add(C, C, b) or_return
	}

	/*
		If it's too big, subtract `b` until it isn't.
	*/
	for internal_gte_abs(C, b) {
		internal_sub(C, C, b) or_return
	}

	/*
		`C` is now the inverse.
	*/
	swap(dest, C)
	return
}
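/*
	Worked example (a sketch): a = 3, b = 10; an even modulus, so
	`_private_inverse_modulo_odd` below does not apply.
	The binary extended Euclidean loop ends with v = 1 and C = 7,
	which is already in [0, b): 3 * 7 == 21 == 1 (mod 10).
*/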
/*
	Computes the modular inverse via the binary extended Euclidean algorithm, that is `dest` = 1 / `a` mod `b`.
	Based on the slow invmod, except this is optimized for the case where `b` is odd,
	as per HAC Note 14.64 on pp. 610.
*/
_private_inverse_modulo_odd :: proc(dest, a, b: ^Int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	x, y, u, v, B, D := &Int{}, &Int{}, &Int{}, &Int{}, &Int{}, &Int{}
	defer internal_destroy(x, y, u, v, B, D)

	sign: Sign

	/*
		2. [modified] `b` must be odd.
	*/
	if internal_is_even(b) { return .Invalid_Argument }

	/*
		Init all our temps.
	*/
	internal_init_multi(x, y, u, v, B, D) or_return

	/*
		`x` == modulus, `y` == value to invert.
	*/
	internal_copy(x, b) or_return

	/*
		We need `y` = `|a|`.
	*/
	internal_mod(y, a, b) or_return

	/*
		If one of `x`, `y` is zero, return an error!
	*/
	if internal_is_zero(x) || internal_is_zero(y) { return .Invalid_Argument }

	/*
		3. `u` = `x`, `v` = `y`, `A` = 1, `B` = 0, `C` = 0, `D` = 1
	*/
	internal_copy(u, x) or_return
	internal_copy(v, y) or_return
	internal_one(D) or_return

	for {
		/*
			4. while `u` is even do:
		*/
		for internal_is_even(u) {
			/*
				4.1 `u` = `u` / 2
			*/
			internal_int_shr1(u, u) or_return

			/*
				4.2 if `B` is odd then:
					`B` = (`B` - `x`) / 2
			*/
			if internal_is_odd(B) {
				internal_sub(B, B, x) or_return
			}

			/*
				`B` = `B` / 2
			*/
			internal_int_shr1(B, B) or_return
		}

		/*
			5. while `v` is even do:
		*/
		for internal_is_even(v) {
			/*
				5.1 `v` = `v` / 2
			*/
			internal_int_shr1(v, v) or_return

			/*
				5.2 if `D` is odd then:
					`D` = (`D` - `x`) / 2
			*/
			if internal_is_odd(D) {
				internal_sub(D, D, x) or_return
			}

			/*
				`D` = `D` / 2
			*/
			internal_int_shr1(D, D) or_return
		}

		/*
			6. if `u` >= `v` then:
		*/
		if internal_cmp(u, v) != -1 {
			/*
				`u` = `u` - `v`, `B` = `B` - `D`
			*/
			internal_sub(u, u, v) or_return
			internal_sub(B, B, D) or_return
		} else {
			/*
				`v` = `v` - `u`, `D` = `D` - `B`
			*/
			internal_sub(v, v, u) or_return
			internal_sub(D, D, B) or_return
		}

		/*
			If `u` is not zero, goto step 4.
		*/
		if internal_is_zero(u) { break }
	}

	/*
		Now `a` = C, `b` = D, `gcd` == g*v
	*/

	/*
		If `v` != 1 then there is no inverse.
	*/
	if internal_cmp(v, 1) != 0 {
		return .Invalid_Argument
	}

	/*
		`D` is now the inverse; bring it into the range [0, b).
		Add `b` while it's negative.
	*/
	sign = a.sign
	for internal_int_is_negative(D) {
		internal_add(D, D, b) or_return
	}

	/*
		Too big: subtract `b` until it isn't.
	*/
	for internal_gte_abs(D, b) {
		internal_sub(D, D, b) or_return
	}

	swap(dest, D)
	dest.sign = sign
	return nil
}
/*
	Returns the logarithm of an `Int` for a `base` that is a power of two.
	Assumes `a` not to be `nil` and to have been initialized.
	Also assumes `base` is a power of two.
*/
_private_log_power_of_two :: proc(a: ^Int, base: DIGIT) -> (log: int, err: Error) {
	base := base
	y: int
	for y = 0; base & 1 == 0; {
		y += 1
		base >>= 1
	}
	log = internal_count_bits(a)
	return (log - 1) / y, err
}
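/*
	Worked example (a sketch): base = 8 strips down to y = 3 (8 == 2**3).
	For a = 4096 = 2**12, internal_count_bits(a) = 13, so the result is
	(13 - 1) / 3 = 4, and indeed 8**4 == 4096.
*/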
/*
	Copies DIGITs from `src` to `dest`.
	Assumes `src` and `dest` not to be `nil` and to have been initialized.
*/
_private_copy_digits :: proc(dest, src: ^Int, digits: int, offset := int(0)) -> (err: Error) {
	digits := digits
	/*
		If dest == src, do nothing.
	*/
	if dest == src {
		return nil
	}

	digits = min(digits, len(src.digit), len(dest.digit))
	mem.copy_non_overlapping(&dest.digit[0], &src.digit[offset], size_of(DIGIT) * digits)
	return nil
}
/*
	Shift left by `digits` * _DIGIT_BITS bits.
*/
_private_int_shl_leg :: proc(quotient: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	if digits <= 0 { return nil }

	/*
		No need to shift a zero.
	*/
	if #force_inline internal_is_zero(quotient) {
		return nil
	}

	/*
		Resize `quotient` to accommodate the extra digits.
	*/
	#force_inline internal_grow(quotient, quotient.used + digits) or_return

	/*
		Increment the used count by the shift amount, then copy upwards.
		Much like `_private_int_shr_leg`, this is implemented using a sliding window,
		except the window goes the other way around.
	*/
	#no_bounds_check for x := quotient.used; x > 0; x -= 1 {
		quotient.digit[x + digits - 1] = quotient.digit[x - 1]
	}

	quotient.used += digits
	mem.zero_slice(quotient.digit[:digits])
	return nil
}
/*
	Shift right by `digits` * _DIGIT_BITS bits.
*/
_private_int_shr_leg :: proc(quotient: ^Int, digits: int, allocator := context.allocator) -> (err: Error) {
	context.allocator = allocator

	if digits <= 0 { return nil }

	/*
		If digits > used, simply zero and return.
	*/
	if digits > quotient.used { return internal_zero(quotient) }

	/*
		Much like `_private_int_shl_leg`, this is implemented using a sliding window,
		except the window goes the other way around.

		b-2 | b-1 | b0 | b1 | b2 | ... | bb |  ---->
		            /\                         ---->
		             \-------------------/     ---->
	*/
	#no_bounds_check for x := 0; x < (quotient.used - digits); x += 1 {
		quotient.digit[x] = quotient.digit[x + digits]
	}

	quotient.used -= digits
	internal_zero_unused(quotient)
	return internal_clamp(quotient)
}
/*
	======================== End of private procedures ========================

	=============================== Private tables ===============================

	Tables used by `internal_*` and `_*`.
*/
_private_int_rem_128 := [?]DIGIT{
	0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
	0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
	1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
	1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
	0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
	1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
	1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
	1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
}
#assert(128 * size_of(DIGIT) == size_of(_private_int_rem_128))
_private_int_rem_105 := [?]DIGIT{
	0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
	0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
	0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
	1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
	0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
	1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
	1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
}
#assert(105 * size_of(DIGIT) == size_of(_private_int_rem_105))
_PRIME_TAB_SIZE :: 256
_private_prime_table := [_PRIME_TAB_SIZE]DIGIT{
	0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
	0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
	0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
	0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F, 0x0083,
	0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
	0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
	0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
	0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
	0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
	0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
	0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
	0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
	0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
	0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
	0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
	0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
	0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
	0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
	0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
	0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
	0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
	0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
	0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
	0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
	0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
	0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
	0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
	0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
	0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
	0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
	0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
	0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653,
}
#assert(_PRIME_TAB_SIZE * size_of(DIGIT) == size_of(_private_prime_table))
when MATH_BIG_FORCE_64_BIT || (!MATH_BIG_FORCE_32_BIT && size_of(rawptr) == 8) {
	_factorial_table := [35]_WORD{
		/* f(00): */ 1,
		/* f(01): */ 1,
		/* f(02): */ 2,
		/* f(03): */ 6,
		/* f(04): */ 24,
		/* f(05): */ 120,
		/* f(06): */ 720,
		/* f(07): */ 5_040,
		/* f(08): */ 40_320,
		/* f(09): */ 362_880,
		/* f(10): */ 3_628_800,
		/* f(11): */ 39_916_800,
		/* f(12): */ 479_001_600,
		/* f(13): */ 6_227_020_800,
		/* f(14): */ 87_178_291_200,
		/* f(15): */ 1_307_674_368_000,
		/* f(16): */ 20_922_789_888_000,
		/* f(17): */ 355_687_428_096_000,
		/* f(18): */ 6_402_373_705_728_000,
		/* f(19): */ 121_645_100_408_832_000,
		/* f(20): */ 2_432_902_008_176_640_000,
		/* f(21): */ 51_090_942_171_709_440_000,
		/* f(22): */ 1_124_000_727_777_607_680_000,
		/* f(23): */ 25_852_016_738_884_976_640_000,
		/* f(24): */ 620_448_401_733_239_439_360_000,
		/* f(25): */ 15_511_210_043_330_985_984_000_000,
		/* f(26): */ 403_291_461_126_605_635_584_000_000,
		/* f(27): */ 10_888_869_450_418_352_160_768_000_000,
		/* f(28): */ 304_888_344_611_713_860_501_504_000_000,
		/* f(29): */ 8_841_761_993_739_701_954_543_616_000_000,
		/* f(30): */ 265_252_859_812_191_058_636_308_480_000_000,
		/* f(31): */ 8_222_838_654_177_922_817_725_562_880_000_000,
		/* f(32): */ 263_130_836_933_693_530_167_218_012_160_000_000,
		/* f(33): */ 8_683_317_618_811_886_495_518_194_401_280_000_000,
		/* f(34): */ 295_232_799_039_604_140_847_618_609_643_520_000_000,
	}
} else {
	_factorial_table := [21]_WORD{
		/* f(00): */ 1,
		/* f(01): */ 1,
		/* f(02): */ 2,
		/* f(03): */ 6,
		/* f(04): */ 24,
		/* f(05): */ 120,
		/* f(06): */ 720,
		/* f(07): */ 5_040,
		/* f(08): */ 40_320,
		/* f(09): */ 362_880,
		/* f(10): */ 3_628_800,
		/* f(11): */ 39_916_800,
		/* f(12): */ 479_001_600,
		/* f(13): */ 6_227_020_800,
		/* f(14): */ 87_178_291_200,
		/* f(15): */ 1_307_674_368_000,
		/* f(16): */ 20_922_789_888_000,
		/* f(17): */ 355_687_428_096_000,
		/* f(18): */ 6_402_373_705_728_000,
		/* f(19): */ 121_645_100_408_832_000,
		/* f(20): */ 2_432_902_008_176_640_000,
	}
}
/*
	========================= End of private tables =========================
*/