GR32.Transpose.pas 19 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772
  1. unit GR32.Transpose;
  2. (* ***** BEGIN LICENSE BLOCK *****
  3. * Version: MPL 1.1 or LGPL 2.1 with linking exception
  4. *
  5. * The contents of this file are subject to the Mozilla Public License Version
  6. * 1.1 (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. * http://www.mozilla.org/MPL/
  9. *
  10. * Software distributed under the License is distributed on an "AS IS" basis,
  11. * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  12. * for the specific language governing rights and limitations under the
  13. * License.
  14. *
  15. * Alternatively, the contents of this file may be used under the terms of the
  16. * Free Pascal modified version of the GNU Lesser General Public License
  17. * Version 2.1 (the "FPC modified LGPL License"), in which case the provisions
  18. * of this license are applicable instead of those above.
  19. * Please see the file LICENSE.txt for additional information concerning this
  20. * license.
  21. *
  22. * The Original Code is Transpose for Graphics32
  23. *
  24. * The Initial Developers of the Original Code are
  25. * Anders Melander <[email protected]>
  26. * Mattias Andersson <[email protected]>
  27. *
  28. * Portions created by the Initial Developer are Copyright (C) 2010
  29. * the Initial Developer. All Rights Reserved.
  30. *
  31. * ***** END LICENSE BLOCK ***** *)
  32. interface
  33. {$include GR32.inc}
  34. // Define USE_GLOBALBUFFER to use a shared, global block buffer in CacheObliviousTransposeEx32
  35. {$define USE_GLOBALBUFFER}
  36. // Define USE_MOVE to use Move() instead of MoveLongword()
  37. {$define USE_MOVE}
  38. uses
  39. Classes,
  40. SyncObjs,
  41. GR32,
  42. GR32_LowLevel,
  43. GR32_Bindings;
  44. //------------------------------------------------------------------------------
  45. //
  46. // Low level transpose API
  47. //
  48. //------------------------------------------------------------------------------
  type
    // Signature shared by every low level transpose implementation in this unit.
    // Src and Dst point to 32-bit pixel data; SrcWidth and SrcHeight are the
    // dimensions of the source.
    TTranspose32 = procedure(Src, Dst: Pointer; SrcWidth, SrcHeight: integer);


  //------------------------------------------------------------------------------
  //
  //      TBitmap32 transpose routines
  //
  //------------------------------------------------------------------------------
  // Both overloads dispatch through the _Transpose32 binding declared below.
  //------------------------------------------------------------------------------
  procedure Transpose32(Src, Dst: TBitmap32); overload; {$IFDEF USEINLINING} inline; {$ENDIF}
  procedure Transpose32(Src, Dst: Pointer; SrcWidth, SrcHeight: integer); overload; {$IFDEF USEINLINING} inline; {$ENDIF}


  //------------------------------------------------------------------------------
  //
  //      Reference functions
  //
  //------------------------------------------------------------------------------
  procedure ReferenceTranspose32(Src, Dst: Pointer; Width, Height: integer);


  //------------------------------------------------------------------------------
  //
  //      Transpose implementations
  //
  //------------------------------------------------------------------------------
  // These are normally not called directly; call one of the Transpose32
  // overloads instead.
  //------------------------------------------------------------------------------
  procedure CacheObliviousTranspose32(Src, Dst: pointer; Width, Height: integer);
  procedure CacheObliviousTransposeEx32(Src, Dst: pointer; Width, Height: integer);
  {$if (not defined(PUREPASCAL)) and (not defined(OMIT_SSE2))}
  procedure SuperDuperTranspose32(Src, Dst: Pointer; W, Height: integer);
  {$ifend}


  //------------------------------------------------------------------------------
  //
  //      Bindings
  //
  //------------------------------------------------------------------------------
  var
    // Currently bound transpose implementation (registered in TransposeRegistry).
    _Transpose32: TTranspose32;
    // Function registry holding the candidate implementations of _Transpose32.
    TransposeRegistry: TFunctionRegistry;
  86. //------------------------------------------------------------------------------
  87. //------------------------------------------------------------------------------
  88. //------------------------------------------------------------------------------
  89. implementation
  90. //------------------------------------------------------------------------------
  91. //
  92. // TBitmap32 transpose routine
  93. //
  94. //------------------------------------------------------------------------------
  // Transposes the pixels of Src into Dst.
  // Dst is first resized to Src.Height x Src.Width; the pixel data is then
  // transposed by whatever implementation is currently bound to _Transpose32.
  // NOTE(review): Dst is resized before the source bits are read, so Src and Dst
  // are presumably expected to be different bitmaps - confirm with callers.
  procedure Transpose32(Src, Dst: TBitmap32);
  var
    SourceWidth, SourceHeight: integer;
  begin
    Dst.SetSize(Src.Height, Src.Width);

    // The source dimensions are deliberately sampled after the resize.
    SourceWidth := Src.Width;
    SourceHeight := Src.Height;

    _Transpose32(Src.Bits, Dst.Bits, SourceWidth, SourceHeight);
  end;
  // Transposes a raw buffer of SrcWidth x SrcHeight 32-bit pixels at Src into
  // the buffer at Dst, using the implementation currently bound to _Transpose32.
  procedure Transpose32(Src, Dst: Pointer; SrcWidth, SrcHeight: integer);
  var
    Implementation_: TTranspose32;
  begin
    Implementation_ := _Transpose32;
    Implementation_(Src, Dst, SrcWidth, SrcHeight);
  end;
  104. //------------------------------------------------------------------------------
  105. //
  106. // SuperDuperTranspose32
  107. //
  108. //------------------------------------------------------------------------------
  109. // Based on:
  110. //
  111. // - MatrixTranspose by AW
  112. // http://masm32.com/board/index.php?topic=6140.msg65145#msg65145
  113. //
  114. // - 4x4 matrix transpose by Siekmanski
  115. // http://masm32.com/board/index.php?topic=6127.msg65026#msg65026
  116. //
  117. // Ported to Delphi by Anders Melander
  118. //------------------------------------------------------------------------------
  {$if (not defined(PUREPASCAL)) and (not defined(OMIT_SSE2))}
  procedure SuperDuperTranspose32(Src, Dst: Pointer; W, Height: integer); //{$IFDEF FPC}assembler;{$ENDIF}
  // TODO : This has become a mess. Split into separate x86 and x64 implementations.
  //
  // SSE transpose of a W x Height block of 32-bit values from Src to Dst.
  //
  // The work is done in three phases:
  //   1. All complete 4x4 tiles: four source rows are loaded with movups,
  //      transposed in registers (unpcklps/unpckhps/movlhps/movhlps) and written
  //      as four destination rows.
  //   2. Leftover columns (W mod 4): copied one value at a time over all rows.
  //   3. Leftover rows (Height mod 4): copied one value at a time over all columns.
  //
  // Row sizes are kept in bytes: sourceRowSize = W*4, destRowSize = Height*4.
  type
    dword = Cardinal;
  // Parameters (x86):
  //   EAX <- Source
  //   EDX <- Destination
  //   ECX <- Width
  //   Stack[0] <- Height
  // Preserves: EDI, ESI, EBX
  //
  // Parameters (x64):
  //   RCX <- Source
  //   RDX <- Destination
  //   R8 <- Width
  //   R9 <- Height
  // Preserves: RDI, RSI, RBX, XMM4, XMM5, XMM6
  var
    Source, Destination: Pointer;
  {$if defined(TARGET_x86)}
    Width: dword;
  {$ifend}
    X4x4Required: dword;          // Number of complete 4-column groups (W div 4)
    Y4x4Required: dword;          // Number of complete 4-row groups (Height div 4)
    remainderX: dword;            // Leftover columns (W mod 4)
    remainderY: dword;            // Leftover rows (Height mod 4)
  {$if defined(TARGET_x86)}
    destRowSize: dword;           // Held in R10 on x64
    sourceRowSize: dword;         // Held in R11 on x64
  {$ifend}
    savedDest: Pointer;
  {$if defined(TARGET_x64) and defined(FPC)}begin{$ifend}
  asm
  {$if defined(TARGET_x64)}
  {$IFNDEF FPC}
    .PUSHNV RDI
    .PUSHNV RSI
    .PUSHNV RBX
    .SAVENV XMM4
    .SAVENV XMM5
    .SAVENV XMM6
  {$ELSE}
    push RDI
    push RSI
    push RBX
  {$ENDIF}
  {$elseif defined(TARGET_x86)}
    push edi
    push esi
    push ebx
  {$else}
    {$message fatal 'Unsupported target'}
  {$ifend}

    // Save the pointer parameters; their registers are reused below
  {$if defined(TARGET_x64)}
    mov Destination, RDX
    mov Source, RCX
  {$elseif defined(TARGET_x86)}
    mov Destination, Dst
    mov Source, Src
  {$ifend}

  {$if defined(TARGET_x86)}
    mov Width, W
  {$ifend}

    // X4x4Required := W div 4; remainderX := W mod 4
    mov eax, W
    mov ebx, 4
    mov edx, 0
    div ebx
    mov X4x4Required, eax
    mov remainderX, edx

    // Y4x4Required := Height div 4; remainderY := Height mod 4
    mov eax, Height
    mov ebx, 4
    mov edx, 0
    div ebx
    mov Y4x4Required, eax
    mov remainderY, edx

    // destRowSize := Height * 4 (bytes)
    mov eax, Height
    shl eax, 2
  {$if defined(TARGET_x86)}
    mov destRowSize, eax
  {$elseif defined(TARGET_x64)}
    mov r10, rax
  {$ifend}

    // sourceRowSize := W * 4 (bytes)
  {$if defined(TARGET_x86)}
    mov eax, Width
  {$elseif defined(TARGET_x64)}
    mov eax, W
  {$ifend}
    shl eax, 2
  {$if defined(TARGET_x86)}
    mov sourceRowSize, eax
  {$elseif defined(TARGET_x64)}
    mov r11, rax
  {$ifend}

    // Phase 1: complete 4x4 tiles. ebx = index of the current 4-row group.
    mov ebx, 0
  @@loop1outer:
    cmp ebx, Y4x4Required // while ebx < Y4x4Required (Height div 4)
    jae @@loop1outer_exit

    // find starting point for source: Source + ebx * 4 rows
    mov eax, ebx
  {$if defined(TARGET_x86)}
    mul sourceRowSize
  {$elseif defined(TARGET_x64)}
    mul r11
  {$ifend}
    shl eax, 2
  {$if defined(TARGET_x86)}
    mov esi, Source
    add esi, eax
    mov ecx, esi // save
  {$elseif defined(TARGET_x64)}
    mov rsi, Source
    add rsi, rax
    mov rcx, rsi // save
  {$ifend}

    // find starting point for destination: Destination + ebx * 16 bytes
    mov eax, ebx
    shl eax, 4
  {$if defined(TARGET_x86)}
    mov edi, Destination
    add edi, eax
    mov savedDest, edi // save
    push ebx
  {$elseif defined(TARGET_x64)}
    mov rdi, Destination
    add rdi, rax
    mov savedDest, rdi // save
    push rbx
  {$ifend}

    // ebx = index of the current 4-column group
    mov ebx, 0
  @@loop1inner:
    cmp ebx, X4x4Required // while ebx < X4x4Required
    jae @@loop1inner_exit

    // Load the four rows of the tile
    mov eax, ebx
    shl eax, 4
  {$if defined(TARGET_x86)}
    mov esi, ecx
    add esi, eax
    movups xmm0, [esi]
    add esi, sourceRowSize
    movups xmm1, [esi]
    add esi, sourceRowSize
    movups xmm2, [esi]
    add esi, sourceRowSize
    movups xmm3, [esi]
  {$elseif defined(TARGET_x64)}
    mov rsi, rcx
    add rsi, rax
    movups xmm0, [rsi]
    add rsi, r11
    movups xmm1, [rsi]
    add rsi, r11
    movups xmm2, [rsi]
    add rsi, r11
    movups xmm3, [rsi]
  {$ifend}

    // 4x4 transpose in registers; result rows end up in xmm4, xmm5, xmm6, xmm2
    movaps xmm4,xmm0
    movaps xmm5,xmm2
    unpcklps xmm4,xmm1
    unpcklps xmm5,xmm3
    unpckhps xmm0,xmm1
    unpckhps xmm2,xmm3
    movaps xmm1,xmm4
    movaps xmm6,xmm0
    movlhps xmm4,xmm5
    movlhps xmm6,xmm2
    movhlps xmm5,xmm1
    movhlps xmm2,xmm0

    // Destination of the tile: savedDest + ebx * 4 destination rows
  {$if defined(TARGET_x86)}
    mov eax, destRowSize
  {$elseif defined(TARGET_x64)}
    mov rax, r10
  {$ifend}
    shl eax, 2
    mul ebx
  {$if defined(TARGET_x86)}
    mov edi, savedDest
    add edi, eax
  {$elseif defined(TARGET_x64)}
    mov rdi, savedDest
    add rdi, rax
  {$ifend}

    // Store the four transposed rows
  {$if defined(TARGET_x86)}
    movups [edi], xmm4
    add edi, destRowSize
    movups [edi], xmm5
    add edi, destRowSize
    movups [edi], xmm6
    add edi, destRowSize
    movups [edi], xmm2
  {$elseif defined(TARGET_x64)}
    movups [rdi], xmm4
    add rdi, r10
    movups [rdi], xmm5
    add rdi, r10
    movups [rdi], xmm6
    add rdi, r10
    movups [rdi], xmm2
  {$ifend}

    inc ebx
    jmp @@loop1inner
  @@loop1inner_exit:
    // Restore the outer loop counter
  {$if defined(TARGET_x86)}
    pop ebx
  {$elseif defined(TARGET_x64)}
    pop rbx
  {$ifend}
    inc ebx
    jmp @@loop1outer
  @@loop1outer_exit:

    // Phase 2: deal with Width not multiple of 4 (leftover columns)
    cmp remainderX, 1 // if remainderX >= 1
    jb @@no_extra_x

    // Source of first leftover column: Source + X4x4Required * 16 bytes
    mov eax, X4x4Required
    shl eax, 4
  {$if defined(TARGET_x86)}
    mov esi, Source
    add esi, eax
  {$elseif defined(TARGET_x64)}
    mov rsi, Source
    add rsi, rax
  {$ifend}

    // Matching destination row: Destination + X4x4Required * 4 destination rows
    mov eax, X4x4Required
    shl eax, 2
  {$if defined(TARGET_x86)}
    mul destRowSize
    mov edi, Destination
    add edi, eax
  {$elseif defined(TARGET_x64)}
    mul r10
    mov rdi, Destination
    add rdi, rax
  {$ifend}

    mov edx, 0
  @@extra_x:
    cmp edx, remainderX // while edx < remainderX
    jae @@extra_x_exit

    mov ecx, 0
    mov eax, 0
  @@extra_x_y:
    cmp ecx, Height // while ecx < Height
    jae @@extra_x_y_exit

  {$if defined(TARGET_x86)}
    mov ebx, dword ptr [esi+eax]
    mov dword ptr [edi+4*ecx], ebx
  {$elseif defined(TARGET_x64)}
    mov ebx, dword ptr [rsi+rax]
    mov dword ptr [rdi+4*rcx], ebx
  {$ifend}

    // Next source row
  {$if defined(TARGET_x86)}
    add eax, sourceRowSize
  {$elseif defined(TARGET_x64)}
    add rax, r11
  {$ifend}
    inc ecx
    jmp @@extra_x_y
  @@extra_x_y_exit:

    // Next source column / next destination row
  {$if defined(TARGET_x86)}
    add esi, 4
    add edi, destRowSize
  {$elseif defined(TARGET_x64)}
    add rsi, 4
    add rdi, r10
  {$ifend}
    inc edx
    jmp @@extra_x
  @@extra_x_exit:
  @@no_extra_x:

    // Phase 3: deal with Height not multiple of 4 (leftover rows)
    cmp remainderY, 1 // if remainderY >= 1
    jb @@no_extra_y

    // Source of first leftover row: Source + Y4x4Required * 4 source rows
    mov eax, Y4x4Required
    shl eax, 2
  {$if defined(TARGET_x86)}
    mul sourceRowSize
  {$elseif defined(TARGET_x64)}
    mul r11
  {$ifend}
  {$if defined(TARGET_x86)}
    mov esi, Source
    add esi, eax
  {$elseif defined(TARGET_x64)}
    mov rsi, Source
    add rsi, rax
  {$ifend}

    // Matching destination column: Destination + Y4x4Required * 16 bytes
    mov eax, Y4x4Required
    shl eax, 4
  {$if defined(TARGET_x86)}
    mov edi, Destination
    add edi, eax
  {$elseif defined(TARGET_x64)}
    mov rdi, Destination
    add rdi, rax
  {$ifend}

    mov edx,0
  @@extra_y:
    cmp edx, remainderY // while edx < remainderY
    jae @@extra_y_exit

    mov ecx, 0
    mov eax, 0
  @@extra_y_x:
  {$if defined(TARGET_x86)}
    cmp ecx, Width // while ecx < Width
  {$elseif defined(TARGET_x64)}
    cmp ecx, W // while ecx < Width
  {$ifend}
    jae @@extra_y_x_exit

  {$if defined(TARGET_x86)}
    mov ebx, dword ptr [esi+4*ecx]
    mov dword ptr [edi+eax], ebx
  {$elseif defined(TARGET_x64)}
    mov ebx, dword ptr [rsi+4*rcx]
    mov dword ptr [rdi+rax], ebx
  {$ifend}

    // Next destination row
  {$if defined(TARGET_x86)}
    add eax, destRowSize
  {$elseif defined(TARGET_x64)}
    add rax, r10
  {$ifend}
    inc ecx
    jmp @@extra_y_x
  @@extra_y_x_exit:

    // Next source row / next destination column
  {$if defined(TARGET_x86)}
    add esi, sourceRowSize
    add edi, 4
  {$elseif defined(TARGET_x64)}
    add rsi, r11
    add rdi, 4
  {$ifend}
    inc edx
    jmp @@extra_y
  @@extra_y_exit:
  @@no_extra_y:

    // Epilogue: registers are restored in the reverse order of the pushes in the
    // prologue (the stack is LIFO).
  {$if defined(TARGET_x64)}
  {$IFDEF FPC}
    pop RBX
    pop RSI
    pop RDI
  {$ENDIF}
  {$elseif defined(TARGET_x86)}
    pop ebx
    pop esi
    pop edi
  {$ifend}
  {$if defined(TARGET_x64) and defined(FPC)}end['XMM4', 'XMM5', 'XMM6'];{$ifend}
  end;
  {$ifend}
  471. //------------------------------------------------------------------------------
  472. //
  473. // ReferenceTranspose32
  474. //
  475. //------------------------------------------------------------------------------
  476. // Simple, no-nonsense transpose
  477. //------------------------------------------------------------------------------
  // Straightforward transpose: the value at source (column x, row y) is written
  // to destination (column y, row x). The source is Width values per row, the
  // destination Height values per row.
  procedure ReferenceTranspose32(Src, Dst: pointer; Width, Height: integer);
  var
    Row, Col: integer;
    SourcePixel, TargetPixel: PColor32;
  begin
    // The source is walked linearly; after Width reads it is at the next row.
    SourcePixel := Src;

    for Row := 0 to Height-1 do
    begin
      // A source row becomes a destination column: start at column "Row" and
      // step one full destination row per pixel.
      TargetPixel := Dst;
      Inc(TargetPixel, Row);

      for Col := 0 to Width-1 do
      begin
        TargetPixel^ := SourcePixel^;
        Inc(SourcePixel);
        Inc(TargetPixel, Height);
      end;
    end;
  end;
  500. //------------------------------------------------------------------------------
  501. //
  502. // CacheObliviousTranspose32
  503. //
  504. //------------------------------------------------------------------------------
  505. // Recursive implementation of the cache oblivious transpose algorithm.
  506. //------------------------------------------------------------------------------
  507. // References:
  508. //
  509. // - Harald Prokop
  510. // Master Thesis, MIT, June 1999
  511. // "Cache-Oblivious Algorithms"
  512. //
  513. //------------------------------------------------------------------------------
  const
    // Largest block edge (in pixels) that is transposed directly instead of
    // being subdivided further.
    CacheObliviousBlockSize = 32;

  // Recursively halves the longer side of the region until both sides are at
  // most CacheObliviousBlockSize, then transposes the small block directly.
  procedure CacheObliviousTranspose32(Src, Dst: pointer; Width, Height: integer);

    procedure Recurse(Col, Row, ColCount, RowCount: integer);
    var
      SrcRow, SrcCol: integer;
      Half: integer;
      SourcePixels, TargetPixels: PColor32Array;
    begin
      if (RowCount > CacheObliviousBlockSize) or (ColCount > CacheObliviousBlockSize) then
      begin
        // Region is still too large: subdivide the longer side
        if (ColCount > RowCount) then
        begin
          // Left half, then right half
          Half := ColCount div 2;
          Recurse(Col, Row, Half, RowCount);
          Recurse(Col + Half, Row, ColCount - Half, RowCount);
        end else
        begin
          // Top half, then bottom half
          Half := RowCount div 2;
          Recurse(Col, Row, ColCount, Half);
          Recurse(Col, Row + Half, ColCount, RowCount - Half);
        end;
        exit;
      end;

      // Small enough: transpose the block element by element
      SourcePixels := PColor32Array(Src);
      TargetPixels := PColor32Array(Dst);
      for SrcRow := Row to Row+RowCount-1 do
        for SrcCol := Col to Col+ColCount-1 do
          TargetPixels^[SrcRow + SrcCol * Height] := SourcePixels^[SrcCol + SrcRow * Width];
    end;

  begin
    Recurse(0, 0, Width, Height);
  end;
  550. //------------------------------------------------------------------------------
  551. // CacheObliviousTransposeEx32 internally transposes to a temporary block buffer
  552. // which is small enough to be cached by the CPU, and then copies from that
  553. // buffer to the destination.
  554. //------------------------------------------------------------------------------
  {$ifdef USE_GLOBALBUFFER}
  var
    // Shared block buffer. It is allocated in the unit initialization and taken
    // (swapped with nil) by CacheObliviousTransposeEx32 while it is in use.
    CacheObliviousTransposeBuffer: pointer;
  {$endif USE_GLOBALBUFFER}

  // Same subdivision scheme as CacheObliviousTranspose32, but each small block is
  // first transposed into a CacheObliviousBlockSize x CacheObliviousBlockSize
  // buffer and then copied row by row to the destination.
  //
  // With USE_GLOBALBUFFER defined the shared buffer is used when it is available;
  // if it is currently taken (nil) a temporary buffer is allocated instead.
  // The buffer is released/returned in a finally block so that an exception
  // raised while transposing neither leaks the temporary buffer nor loses the
  // shared one.
  procedure CacheObliviousTransposeEx32(Src, Dst: pointer; Width, Height: integer);
  var
    BlockBuffer: pointer;

    // Src and Dst point at the first element of the region in the source and of
    // the corresponding (transposed) region in the destination.
    procedure Recurse(Src, Dst: PColor32; ColCount, RowCount: Integer);
    var
      Split: Integer;
      BlockX, BlockY: integer;
      p: PColor32;
    begin
      if (ColCount <= CacheObliviousBlockSize) and (RowCount <= CacheObliviousBlockSize) then
      begin
        // Transpose to block buffer
        for BlockY := 0 to RowCount-1 do
          for BlockX := 0 to ColCount-1 do
            // Buffer[x, y] := Src[y, x]
            PColor32Array(BlockBuffer)[BlockY + BlockX * CacheObliviousBlockSize] := PColor32Array(Src)[BlockX + BlockY * Width];

        // Copy from block buffer: one buffer row per destination row
        p := BlockBuffer;
  {$ifdef USE_MOVE}
        RowCount := RowCount * SizeOf(TColor32); // Count is now in bytes
  {$endif USE_MOVE}
        for BlockY := 0 to ColCount-1 do
        begin
  {$ifdef USE_MOVE}
          Move(p^, Dst^, RowCount);
  {$else USE_MOVE}
          MoveLongword(p^, Dst^, RowCount);
  {$endif USE_MOVE}
          Inc(p, CacheObliviousBlockSize);
          Inc(Dst, Height);
        end;
      end else
      // Subdivide the longer side
      if (RowCount >= ColCount) then
      begin // Split vertically
        Split := RowCount div 2;
        Recurse(Src, Dst, ColCount, Split);
        Inc(Src, Split*Width);
        Inc(Dst, Split);
        Dec(RowCount, Split);
        Recurse(Src, Dst, ColCount, RowCount);
      end else
      begin // Split horizontally
        Split := ColCount div 2;
        Recurse(Src, Dst, Split, RowCount);
        Inc(Src, Split);
        Inc(Dst, Split*Height);
        Dec(ColCount, Split);
        Recurse(Src, Dst, ColCount, RowCount);
      end;
    end;

  {$ifdef USE_GLOBALBUFFER}
  var
    LocalBuffer: pointer; // Non-nil only when a temporary buffer was allocated
  {$endif USE_GLOBALBUFFER}
  begin
  {$ifdef USE_GLOBALBUFFER}
    // Take ownership of the shared buffer, leaving nil behind
  {$ifndef FPC}
    BlockBuffer := TInterlocked.Exchange(CacheObliviousTransposeBuffer, nil);
  {$else}
    BlockBuffer := InterlockedExchange(CacheObliviousTransposeBuffer, nil);
  {$endif}
    if (BlockBuffer = nil) then
    begin
      // Shared buffer is not available; fall back to a temporary one
      GetMem(LocalBuffer, CacheObliviousBlockSize*CacheObliviousBlockSize*SizeOf(TColor32));
      BlockBuffer := LocalBuffer;
    end else
      LocalBuffer := nil;
  {$else USE_GLOBALBUFFER}
    GetMem(BlockBuffer, CacheObliviousBlockSize*CacheObliviousBlockSize*SizeOf(TColor32));
  {$endif USE_GLOBALBUFFER}

    try
      Recurse(Src, Dst, Width, Height);
    finally
  {$ifdef USE_GLOBALBUFFER}
      if (LocalBuffer <> nil) then
        FreeMem(LocalBuffer)
      else
        // Hand the shared buffer back
        CacheObliviousTransposeBuffer := BlockBuffer;
  {$else USE_GLOBALBUFFER}
      FreeMem(BlockBuffer);
  {$endif USE_GLOBALBUFFER}
    end;
  end;
  642. //------------------------------------------------------------------------------
  643. //
  644. // Bindings
  645. //
  646. //------------------------------------------------------------------------------
  // Creates the transpose function registry, registers every implementation
  // available in this build as a candidate for _Transpose32, and binds one.
  procedure RegisterBindings;
  var
    BindVariable: Pointer;
  begin
    // Address of the procedural variable that receives the selected function
    BindVariable := @@_Transpose32;

    TransposeRegistry := NewRegistry('GR32.Transpose bindings');
    TransposeRegistry.RegisterBinding(BindVariable, '_Transpose32');

    TransposeRegistry[BindVariable].Add(@ReferenceTranspose32, [isReference]).Name := 'ReferenceTranspose32';
    TransposeRegistry[BindVariable].Add(@CacheObliviousTranspose32, [isPascal], -16).Name := 'CacheObliviousTranspose32';
    TransposeRegistry[BindVariable].Add(@CacheObliviousTransposeEx32, [isPascal], -32).Name := 'CacheObliviousTransposeEx32';
  {$if (not defined(PUREPASCAL)) and (not defined(OMIT_SSE2))}
    // TODO : In the Gaussian blur benchmark SuperDuperTranspose32 was profiled to
    // be about 3 times slower, on average, than CacheObliviousTransposeEx32.
    // It is still vastly faster in most real-world situations, so it is given
    // priority.
    TransposeRegistry[BindVariable].Add(@SuperDuperTranspose32, [isSSE2], -48).Name := 'SuperDuperTranspose32';
  {$ifend}

    TransposeRegistry.RebindAll;
  end;
  662. //------------------------------------------------------------------------------
  663. //------------------------------------------------------------------------------
  664. //------------------------------------------------------------------------------
  initialization
    RegisterBindings;
  {$ifdef USE_GLOBALBUFFER}
    // Allocate the shared block buffer used by CacheObliviousTransposeEx32
    GetMem(CacheObliviousTransposeBuffer, CacheObliviousBlockSize*CacheObliviousBlockSize*SizeOf(TColor32));
  {$endif USE_GLOBALBUFFER}

  finalization
  {$ifdef USE_GLOBALBUFFER}
    FreeMem(CacheObliviousTransposeBuffer);
    // Do not leave a dangling pointer behind: with nil here, a call to
    // CacheObliviousTransposeEx32 made after finalization allocates its own
    // temporary buffer instead of writing to freed memory.
    CacheObliviousTransposeBuffer := nil;
  {$endif USE_GLOBALBUFFER}
  end.