jfdctfst.pas 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247
  1. {$IFNDEF FPC_DOTTEDUNITS}
  2. Unit JFDctFst;
  3. {$ENDIF FPC_DOTTEDUNITS}
  4. { This file contains a fast, not so accurate integer implementation of the
  5. forward DCT (Discrete Cosine Transform).
  6. A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
  7. on each column. Direct algorithms are also available, but they are
  8. much more complex and seem not to be any faster when reduced to code.
  9. This implementation is based on Arai, Agui, and Nakajima's algorithm for
  10. scaled DCT. Their original paper (Trans. IEICE E-71(11):1095) is in
  11. Japanese, but the algorithm is described in the Pennebaker & Mitchell
  12. JPEG textbook (see REFERENCES section in file README). The following code
  13. is based directly on figure 4-8 in P&M.
  14. While an 8-point DCT cannot be done in less than 11 multiplies, it is
  15. possible to arrange the computation so that many of the multiplies are
  16. simple scalings of the final outputs. These multiplies can then be
  17. folded into the multiplications or divisions by the JPEG quantization
  18. table entries. The AA&N method leaves only 5 multiplies and 29 adds
  19. to be done in the DCT itself.
  20. The primary disadvantage of this method is that with fixed-point math,
  21. accuracy is lost due to imprecise representation of the scaled
  22. quantization values. The smaller the quantization table entry, the less
  23. precise the scaled value, so this implementation does worse with high-
  24. quality-setting files than with low-quality ones. }
  25. { Original: jfdctfst.c ; Copyright (C) 1994-1996, Thomas G. Lane. }
  26. interface
  27. {$I jconfig.inc}
  28. {$IFDEF FPC_DOTTEDUNITS}
  29. uses
  30. System.Jpeg.Jmorecfg,
  31. System.Jpeg.Jinclude,
  32. System.Jpeg.Jpeglib,
  33. System.Jpeg.Jdct; { Private declarations for DCT subsystem }
  34. {$ELSE FPC_DOTTEDUNITS}
  35. uses
  36. jmorecfg,
  37. jinclude,
  38. jpeglib,
  39. jdct; { Private declarations for DCT subsystem }
  40. {$ENDIF FPC_DOTTEDUNITS}
  41. { Perform the forward DCT on one block of samples. }
  42. {GLOBAL}
  43. procedure jpeg_fdct_ifast (var data : array of DCTELEM);
  44. implementation
  45. { This module is specialized to the case DCTSIZE = 8. }
  46. {$ifndef DCTSIZE_IS_8}
  47. Sorry, this code only copes with 8x8 DCTs. { deliberate syntax err }
  48. {$endif}
  49. { Scaling decisions are generally the same as in the LL&M algorithm;
  50. see jfdctint.c for more details. However, we choose to descale
  51. (right shift) multiplication products as soon as they are formed,
  52. rather than carrying additional fractional bits into subsequent additions.
  53. This compromises accuracy slightly, but it lets us save a few shifts.
  54. More importantly, 16-bit arithmetic is then adequate (for 8-bit samples)
  55. everywhere except in the multiplications proper; this saves a good deal
  56. of work on 16-bit-int machines.
  57. Again to save a few shifts, the intermediate results between pass 1 and
  58. pass 2 are not upscaled, but are represented only to integral precision.
  59. A final compromise is to represent the multiplicative constants to only
  60. 8 fractional bits, rather than 13. This saves some shifting work on some
  61. machines, and may also reduce the cost of multiplication (since there
  62. are fewer one-bits in the constants). }
  { Fixed-point setup for the multiplicative constants used by the DCT.
    Every FIX_* value is the named real factor multiplied by CONST_SCALE
    and rounded to the nearest integer; the trailing comment on each line
    gives the resulting integer for CONST_BITS = 8. }
  const
    CONST_BITS  = 8;                         { fractional bits kept in each constant }
    CONST_SCALE = (INT32(1) shl CONST_BITS); { 2^CONST_BITS, i.e. 256 }

    { The cN labels below follow the comments at the points of use in
      jpeg_fdct_ifast. }
    FIX_0_382683433 = INT32(Round(CONST_SCALE * 0.382683433)); { c6    ->  98 }
    FIX_0_541196100 = INT32(Round(CONST_SCALE * 0.541196100)); { c2-c6 -> 139 }
    FIX_0_707106781 = INT32(Round(CONST_SCALE * 0.707106781)); { c4    -> 181 }
    FIX_1_306562965 = INT32(Round(CONST_SCALE * 1.306562965)); { c2+c6 -> 334 }
  { Remove n fractional bits from the fixed-point value x.

    x : value scaled by 2^n.
    n : number of bits to drop (the only caller in this unit passes
        CONST_BITS).

    Rounding: only when USE_ACCURATE_ROUNDING is defined is half an output
    unit (1 shl (n-1)) added before shifting, which rounds to nearest.
    Without it the addition is skipped for speed and the result is simply
    the shifted value, i.e. wrongly rounded about half the time.

    Sign handling: when RIGHT_SHIFT_IS_UNSIGNED is defined, shr is taken to
    shift zeros in from the top, so for a negative operand the n vacated
    high bits are refilled with ones to emulate a sign-propagating shift
    (rounding towards minus infinity).  Otherwise shr is used as-is. }
  function DESCALE(x : INT32; n : int) : INT32;
  var
    biased : INT32;
  begin
  {$ifdef USE_ACCURATE_ROUNDING}
    biased := x + (INT32(1) shl (n-1));
  {$else}
    biased := x;
  {$endif}

  {$ifdef RIGHT_SHIFT_IS_UNSIGNED}
    if biased < 0 then
    begin
      { (not 0) shl (32-n) is a mask with exactly the top n bits set }
      Descale := (biased shr n) or ((not INT32(0)) shl (32-n));
      exit;
    end;
  {$endif}

    Descale := (biased shr n);
  end;
  { Fixed-point multiply: scales the DCTELEM value X by the constant Y,
    where Y is one of the FIX_* values (a real factor pre-multiplied by
    2^CONST_BITS).  The product is descaled by CONST_BITS straight away,
    so the result is back in plain DCTELEM units and no fractional bits
    are carried into later additions. }
  function MULTIPLY(X : DCTELEM; Y: INT32): DCTELEM;
  var
    product : INT32;
  begin
    product := X * Y;
    Multiply := DESCALE(product, CONST_BITS);
  end;
  { One 8-point forward DCT (AA&N flow graph), computed in place on the
    eight operands x0..x7.  Private helper for jpeg_fdct_ifast: the row
    pass and the column pass run exactly this arithmetic and differ only
    in which eight elements of the block they hand in.

    All eight inputs are folded into sums/differences before any operand
    is overwritten, so writing the results back through the same
    references is safe. }
  procedure fdct_ifast_1d(var x0, x1, x2, x3, x4, x5, x6, x7 : DCTELEM);
  var
    sum07, sum16, sum25, sum34 : DCTELEM;   { x[i] + x[7-i] }
    dif07, dif16, dif25, dif34 : DCTELEM;   { x[i] - x[7-i] }
    even10, even11, even12, even13 : DCTELEM;
    odd10, odd11, odd12 : DCTELEM;
    z1, z2, z3, z4, z5, z11, z13 : DCTELEM;
  begin
    { Phase 1: mirror-pair butterflies }
    sum07 := x0 + x7;
    dif07 := x0 - x7;
    sum16 := x1 + x6;
    dif16 := x1 - x6;
    sum25 := x2 + x5;
    dif25 := x2 - x5;
    sum34 := x3 + x4;
    dif34 := x3 - x4;

    { Even part }
    even10 := sum07 + sum34;                { phase 2 }
    even13 := sum07 - sum34;
    even11 := sum16 + sum25;
    even12 := sum16 - sum25;

    x0 := even10 + even11;                  { phase 3 }
    x4 := even10 - even11;

    z1 := MULTIPLY(even12 + even13, FIX_0_707106781);  { c4 }
    x2 := even13 + z1;                      { phase 5 }
    x6 := even13 - z1;

    { Odd part }
    odd10 := dif34 + dif25;                 { phase 2 }
    odd11 := dif25 + dif16;
    odd12 := dif16 + dif07;

    { The rotator is modified from fig 4-8 to avoid extra negations. }
    z5 := MULTIPLY(odd10 - odd12, FIX_0_382683433);    { c6 }
    z2 := MULTIPLY(odd10, FIX_0_541196100) + z5;       { c2-c6 }
    z4 := MULTIPLY(odd12, FIX_1_306562965) + z5;       { c2+c6 }
    z3 := MULTIPLY(odd11, FIX_0_707106781);            { c4 }

    z11 := dif07 + z3;                      { phase 5 }
    z13 := dif07 - z3;

    x5 := z13 + z2;                         { phase 6 }
    x3 := z13 - z2;
    x1 := z11 + z4;
    x7 := z11 - z4;
  end;

  { Perform the forward DCT on one block of samples.

    data : the block, DCTSIZE2 elements stored row after row (DCTSIZE
           elements per row).  It is transformed in place: first every
           row, then every column of the row-transformed block.  The
           outputs are the scaled coefficients of the AA&N method; the
           remaining per-coefficient scaling is meant to be folded into
           quantization, as described at the top of this unit. }

  {GLOBAL}
  procedure jpeg_fdct_ifast (var data : array of DCTELEM);
  type
    TBlock = array [0..DCTSIZE2-1] of DCTELEM;
    PBlock = ^TBlock;
  var
    blk : PBlock;
    row, col, base : int;
  begin
    { View the open array through a fixed 0..DCTSIZE2-1 array type so the
      block is indexed independently of the open array's own bounds. }
    blk := PBlock(@data);

    { Pass 1: process rows. }
    for row := 0 to DCTSIZE-1 do
    begin
      base := row * DCTSIZE;                { first element of this row }
      fdct_ifast_1d(blk^[base + 0], blk^[base + 1],
                    blk^[base + 2], blk^[base + 3],
                    blk^[base + 4], blk^[base + 5],
                    blk^[base + 6], blk^[base + 7]);
    end;

    { Pass 2: process columns. }
    for col := 0 to DCTSIZE-1 do
      fdct_ifast_1d(blk^[DCTSIZE*0 + col], blk^[DCTSIZE*1 + col],
                    blk^[DCTSIZE*2 + col], blk^[DCTSIZE*3 + col],
                    blk^[DCTSIZE*4 + col], blk^[DCTSIZE*5 + col],
                    blk^[DCTSIZE*6 + col], blk^[DCTSIZE*7 + col]);
  end;
  193. end.