stream.pp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431
  1. program stream;
  2. {$ifdef unix}
  3. uses baseunix,unix;
  4. {$endif}
  5. {$ifdef windows}
  6. uses windows;
  7. {$endif}
  8. {-----------------------------------------------------------------------}
  9. { Original code developed by John D. McCalpin }
  10. { Programmers: John D. McCalpin }
  11. { Joe R. Zagar }
  12. { Pascal conversion: Daniel Mantione }
  13. { }
  14. { This program measures memory transfer rates in MB/s for simple }
  15. { computational kernels coded in Pascal. }
  16. {-----------------------------------------------------------------------}
  17. { Copyright 1991-2005: John D. McCalpin }
  18. {-----------------------------------------------------------------------}
  19. { License: }
  20. { 1. You are free to use this program and/or to redistribute }
  21. { this program. }
  22. { 2. You are free to modify this program for your own use, }
  23. { including commercial use, subject to the publication }
  24. { restrictions in item 3. }
  25. { 3. You are free to publish results obtained from running this }
  26. { program, or from works that you derive from this program, }
  27. { with the following limitations: }
  28. { 3a. In order to be referred to as "STREAM benchmark results", }
  29. { published results must be in conformance to the STREAM }
  30. { Run Rules, (briefly reviewed below) published at }
  31. { http://www.cs.virginia.edu/stream/ref.html }
  32. { and incorporated herein by reference. }
  33. { As the copyright holder, John McCalpin retains the }
  34. { right to determine conformity with the Run Rules. }
  35. { 3b. Results based on modified source code or on runs not in }
  36. { accordance with the STREAM Run Rules must be clearly }
  37. { labelled whenever they are published. Examples of }
  38. { proper labelling include: }
  39. { "tuned STREAM benchmark results" }
  40. { "based on a variant of the STREAM benchmark code" }
  41. { Other comparable, clear and reasonable labelling is }
  42. { acceptable. }
  43. { 3c. Submission of results to the STREAM benchmark web site }
  44. { is encouraged, but not required. }
  45. { 4. Use of this program or creation of derived works based on this }
  46. { program constitutes acceptance of these licensing restrictions. }
  47. { 5. Absolutely no warranty is expressed or implied. }
  48. {-----------------------------------------------------------------------}
  49. { INSTRUCTIONS:
  50. *
  51. * 1) Stream requires a good bit of memory to run. Adjust the
  52. * value of 'N' (below) to give a 'timing calibration' of
  53. * at least 20 clock-ticks. This will provide rate estimates
  54. * that should be good to about 5% precision.
  55. }
  56. const N = 2000000;
  57. NTIMES = 10;
  58. OFFSET = 0;
  59. {
  60. * 3) Compile the code with full optimization. Many compilers
  61. * generate unreasonably bad code before the optimizer tightens
  62. * things up. If the results are unreasonably good, on the
  63. * other hand, the optimizer might be too smart for me!
  64. *
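* Try compiling this Pascal version with Free Pascal, e.g.:
* fpc -O3 stream.pp
* (add -dTUNED to time the tuned_STREAM_* procedures instead of the
* inline loops). The original C version was compiled with:
* cc -O stream_omp.c -o stream_omp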
  67. *
  68. * This is known to work on Cray, SGI, IBM, and Sun machines.
  69. *
  70. *
* 4) Mail the results to mccalpin@cs.virginia.edu
  72. * Be sure to include:
  73. * a) computer hardware model number and software revision
  74. * b) the compiler flags
  75. * c) all of the output from the test case.
  76. * Thanks!
  77. *
  78. }
  79. const HLINE = '-------------------------------------------------------------';
  80. inf = 1/0;
  81. var a,b,c:array[0..N+OFFSET-1] of double;
  82. avgtime:array[0..3] of double = (0,0,0,0);
  83. maxtime:array[0..3] of double = (0,0,0,0);
  84. mintime:array[0..3] of double = (inf,inf,inf,inf);
  85. labels:array[0..3] of string[16]= ('Copy:',
  86. 'Scale:',
  87. 'Add:',
  88. 'Triad:');
  89. bytes:array[0..3] of cardinal = (
  90. 2 * sizeof(double) * N,
  91. 2 * sizeof(double) * N,
  92. 3 * sizeof(double) * N,
  93. 3 * sizeof(double) * N
  94. );
  95. const M=20;
  96. function min(a,b:longint):longint;inline;
  97. begin
  98. if a>b then
  99. min:=b
  100. else
  101. min:=a;
  102. end;
  103. function max(a,b:longint):longint;inline;
  104. begin
  105. if a>b then
  106. max:=a
  107. else
  108. max:=b;
  109. end;
  110. function min(a,b:double):double;inline;
  111. begin
  112. if a>b then
  113. min:=b
  114. else
  115. min:=a;
  116. end;
  117. function max(a,b:double):double;inline;
  118. begin
  119. if a>b then
  120. max:=a
  121. else
  122. max:=b;
  123. end;
  124. procedure tuned_STREAM_Copy;
  125. var j:longint;
  126. begin
  127. for j:=0 to N-1 do
  128. c[j]:=a[j];
  129. end;
  130. procedure tuned_STREAM_Scale(scalar:double);
  131. var j:longint;
  132. begin
  133. for j:=0 to N-1 do
  134. b[j]:=scalar*c[j];
  135. end;
  136. procedure tuned_STREAM_Add;
  137. var j:longint;
  138. begin
  139. for j:=0 to N-1 do
  140. c[j]:=a[j]+b[j];
  141. end;
  142. procedure tuned_STREAM_Triad(scalar:double);
  143. var j:longint;
  144. begin
  145. for j:=0 to N-1 do
  146. a[j]:=b[j]+scalar*c[j];
  147. end;
  148. {$ifdef unix}
  149. {$define have_mysecond}
  150. function mysecond:double;
  151. var tp:timeval;
  152. tzp:timezone;
  153. begin
  154. fpgettimeofday(@tp,@tzp);
  155. mysecond:=double(tp.tv_sec)+double(tp.tv_usec)*1e-6;
  156. end;
  157. {$endif}
  158. {$ifdef windows}
  159. {$define have_mysecond}
  160. function mysecond:double;
  161. begin
  162. mysecond:=gettickcount*1e-3;
  163. end;
  164. {$endif}
  165. {$ifndef have_mysecond}
  166. {$error Please implement a mysecond for your platform.}
  167. {$endif}
  168. function checktick:longint;
  169. var i,minDelta,Delta:longint;
  170. t1,t2:double;
  171. timesfound:array[0..M-1] of double;
  172. begin
  173. { Collect a sequence of M unique time values from the system. }
  174. for i:=0 to M-1 do
  175. begin
  176. t1:=mysecond;
  177. t2:=t1;
  178. while t2-t1<1E-6 do
  179. t2:=mysecond;
  180. t1:=t2;
  181. timesfound[i]:=t1;
  182. end;
  183. {
  184. * Determine the minimum difference between these M values.
  185. * This result will be our estimate (in microseconds) for the
  186. * clock granularity.
  187. }
  188. minDelta:=1000000;
  189. for i:=1 to M-1 do
  190. begin
  191. Delta:=trunc(1E6*(timesfound[i]-timesfound[i-1]));
  192. minDelta:=MIN(minDelta,MAX(Delta,0));
  193. end;
  194. checktick:=minDelta;
  195. end;
  196. procedure checkSTREAMresults;
  197. var aj,bj,cj,scalar:double;
  198. asum,bsum,csum:double;
  199. epsilon:double;
  200. j,k:longint;
  201. begin
  202. { reproduce initialization }
  203. aj:=1;
  204. bj:=2;
  205. cj:=0;
  206. { a[] is modified during timing check }
  207. aj:=2*aj;
  208. { now execute timing loop }
  209. scalar:=3;
  210. for k:=0 to NTIMES-1 do
  211. begin
  212. cj:=aj;
  213. bj:=scalar*cj;
  214. cj:=aj+bj;
  215. aj:=bj+scalar*cj;
  216. end;
  217. aj:=aj*N;
  218. bj:=bj*N;
  219. cj:=cj*N;
  220. asum:=0;
  221. bsum:=0;
  222. csum:=0;
  223. for j:=0 to N-1 do
  224. begin
  225. asum:=asum+a[j];
  226. bsum:=bsum+b[j];
  227. csum:=csum+c[j];
  228. end;
  229. {$ifdef VERBOSE}
  230. writeln('Results Comparison: ');
  231. writeln(' Expected : ',aj,' ',bj,' ',cj);
  232. writeln(' Observed : ',asum,' ',bsum,' ',csum);
  233. {$endif}
  234. epsilon:=1e-8;
  235. if abs(aj-asum)/asum>epsilon then
  236. begin
  237. writeln('Failed Validation on array a');
  238. writeln(' Expected : ',aj);
  239. writeln(' Observed : ',asum);
  240. end
  241. else if abs(bj-bsum)/bsum>epsilon then
  242. begin
  243. writeln('Failed Validation on array b');
  244. writeln(' Expected : ',bj);
  245. writeln(' Observed : ',bsum);
  246. end
  247. else if abs(cj-csum)/csum>epsilon then
  248. begin
  249. writeln('Failed Validation on array c');
  250. writeln(' Expected : ',cj);
  251. writeln(' Observed : ',csum);
  252. end
  253. else
  254. writeln('Solution Validates');
  255. end;
  256. var quantum:longint;
  257. BytesPerWord:longint;
  258. j,k:longint;
  259. scalar,t:double;
  260. times:array[0..3,0..NTIMES-1] of double;
  261. begin
  262. { --- SETUP --- determine precision and check timing --- }
  263. writeln(HLINE);
  264. writeln('STREAM version Revision: 5.6');
  265. writeln(HLINE);
  266. BytesPerWord:=sizeof(double);
  267. writeln('This system uses ',BytesPerWord,' bytes per DOUBLE PRECISION word.');
  268. writeln(HLINE);
  269. writeln('Array size = ',N,', Offset = ',OFFSET);
  270. writeln('Total memory required = ',3*BytesPerWord*(N/1048576),' MB.');
  271. writeln('Each test is run ',NTIMES,' times, but only');
  272. writeln('the *best* time for each is used.');
  273. writeln(HLINE);
  274. writeln('writelning one line per active thread....');
  275. { Get initial value for system clock. }
  276. for j:=0 to N-1 do
  277. begin
  278. a[j]:=1;
  279. b[j]:=2;
  280. c[j]:=0;
  281. end;
  282. writeln(HLINE);
  283. quantum:=checktick;
  284. if quantum>=1 then
  285. writeln('Your clock granularity/precision appears to be ',quantum,
  286. ' microseconds.')
  287. else
  288. writeln('Your clock granularity appears to be '+
  289. 'less than one microsecond.');
  290. t:=mysecond;
  291. for j:=0 to N-1 do
  292. a[j]:=2*a[j];
  293. t:=1E6*(mysecond-t);
  294. writeln('Each test below will take on the order of ',t,
  295. ' microseconds.');
  296. writeln(' (= ',t/quantum,' clock ticks)');
  297. writeln('Increase the size of the arrays if this shows that');
  298. writeln('you are not getting at least 20 clock ticks per test.');
  299. writeln(HLINE);
  300. writeln('WARNING -- The above is only a rough guideline.');
  301. writeln('For best results, please be sure you know the');
  302. writeln('precision of your system timer.');
  303. writeln(HLINE);
  304. { --- MAIN LOOP --- repeat test cases NTIMES times --- }
  305. scalar:=3;
  306. for k:=0 to NTIMES-1 do
  307. begin
  308. times[0,k]:=mysecond();
  309. {$ifdef TUNED}
  310. tuned_STREAM_Copy();
  311. {$else}
  312. for j:=0 to N-1 do
  313. c[j]:=a[j];
  314. {$endif}
  315. times[0,k]:=mysecond-times[0,k];
  316. times[1,k]:=mysecond;
  317. {$ifdef TUNED}
  318. tuned_STREAM_Scale(scalar);
  319. {$else}
  320. for j:=0 to N-1 do
  321. b[j]:=scalar*c[j];
  322. {$endif}
  323. times[1,k]:=mysecond-times[1,k];
  324. times[2,k]:=mysecond;
  325. {$ifdef TUNED}
  326. tuned_STREAM_Add();
  327. {$else}
  328. for j:=0 to N-1 do
  329. c[j]:=a[j]+b[j];
  330. {$endif}
  331. times[2,k]:=mysecond-times[2,k];
  332. times[3,k]:=mysecond;
  333. {$ifdef TUNED}
  334. tuned_STREAM_Triad(scalar);
  335. {$else}
  336. for j:=0 to N-1 do
  337. a[j]:=b[j]+scalar*c[j];
  338. {$endif}
  339. times[3,k]:=mysecond-times[3,k];
  340. end;
  341. { --- SUMMARY --- }
  342. for k:=1 to NTIMES-1 do { note -- skip first iteration }
  343. for j:=0 to 3 do
  344. begin
  345. avgtime[j]:=avgtime[j] + times[j,k];
  346. mintime[j]:=MIN(mintime[j], times[j,k]);
  347. maxtime[j]:=MAX(maxtime[j], times[j,k]);
  348. end;
  349. writeln('Function Rate (MB/s) Avg time Min time Max time');
  350. for j:=0 to 3 do
  351. begin
  352. avgtime[j]:=avgtime[j]/(NTIMES-1);
  353. writeln(labels[j]:11,
  354. 1E-6*bytes[j]/mintime[j]:11:4,
  355. avgtime[j]:11:4,
  356. mintime[j]:11:4,
  357. maxtime[j]:11:4);
  358. end;
  359. writeln(HLINE);
  360. { --- Check Results --- }
  361. checkSTREAMresults;
  362. writeln(HLINE);
  363. end.