  { arm.inc }
  {
      This file is part of the Free Pascal run time library.
      Copyright (c) 2003 by the Free Pascal development team.

      Processor dependent implementation for the system unit for
      ARM

      See the file COPYING.FPC, included in this distribution,
      for details about the copyright.

      This program is distributed in the hope that it will be useful,
      but WITHOUT ANY WARRANTY; without even the implied warranty of
      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

   **********************************************************************}
  { All inline assembly in this file uses GNU assembler syntax. }
  {$asmmode gas}

  { FPC_SYSTEM_FPC_MOVE marks that this file supplies Move itself
    (no earlier include already provided one). }
  {$ifndef FPC_SYSTEM_HAS_MOVE}
  {$define FPC_SYSTEM_FPC_MOVE}
  {$endif FPC_SYSTEM_HAS_MOVE}

  {$ifdef FPC_SYSTEM_FPC_MOVE}
  const
    { Both flags are written by fpc_cpucodeinit while it probes the CPU;
      they start out false. }
    cpu_has_edsp : boolean = false;
    in_edsp_test : boolean = false;
  {$endif FPC_SYSTEM_FPC_MOVE}
  {$if not(defined(wince)) and not(defined(gba)) and not(defined(nds)) and not(defined(FPUSOFT)) and not(defined(FPULIBGCC))}

  {$define FPC_SYSTEM_HAS_SYSINITFPU}

  {$if not defined(darwin) and not defined(FPUVFPV2) and not defined(FPUVFPV3) and not defined(FPUVFPV3_D16)}
  { Non-VFP variant: programs the floating point status register through
    the rfs/wfs instructions. Clobbers r0. }
  Procedure SysInitFPU;{$ifdef SYSTEMINLINE}inline;{$endif}
  begin
    { Enable FPU exceptions, but disable INEXACT, UNDERFLOW, DENORMAL }
    asm
      rfs     r0
      // clear bits 16..20 ...
      and     r0,r0,#0xffe0ffff
      // ... and set bits 16..18 again
      orr     r0,r0,#0x00070000
      wfs     r0
    end;
  end;

  {$else}
  { VFP (and darwin) variant: programs FPSCR. Clobbers r0. }
  Procedure SysInitFPU;{$ifdef SYSTEMINLINE}inline;{$endif}
  begin
    { Enable FPU exceptions, but disable INEXACT, UNDERFLOW, DENORMAL }
    asm
      fmrx    r0,fpscr
      // set "round to nearest" mode
      and     r0,r0,#0xff3fffff
      // mask "exception happened" and overflow flags
      and     r0,r0,#0xffffff20
      // mask exception flags
      and     r0,r0,#0xffff40ff
      // Floating point exceptions cause kernel panics on iPhoneOS 2.2.1...,
      // so the exception enables below are skipped on darwin
  {$ifndef darwin}
      // disable flush-to-zero mode (IEEE math compliant)
      and     r0,r0,#0xfeffffff
      // enable invalid operation, div-by-zero and overflow exceptions
      orr     r0,r0,#0x00000700
  {$endif}
      fmxr    fpscr,r0
    end;
  end;
  {$endif}

  {$endif}
  { CPU specific start-up hook: initialises the FPU control state, unless
    this code is running as part of a library. }
  procedure fpc_cpuinit;
  begin
    { don't let libraries influence the FPU cw set by the host program }
    if not IsLibrary then
      SysInitFPU;
  end;
  {$ifdef wince}
  { FPU control word access exported by coredll. }
  function _controlfp(new: DWORD; mask: DWORD): DWORD; cdecl; external 'coredll';

  {$define FPC_SYSTEM_HAS_SYSRESETFPU}
  { Intentionally empty on this target. }
  Procedure SysResetFPU;{$ifdef SYSTEMINLINE}inline;{$endif}
  begin
  end;

  {$define FPC_SYSTEM_HAS_SYSINITFPU}
  { Sets the FPU control word through _controlfp; the returned previous
    control word is ignored. }
  Procedure SysInitFPU;{$ifdef SYSTEMINLINE}inline;{$endif}
  begin
    { Enable FPU exceptions, but disable INEXACT, UNDERFLOW, DENORMAL }
    { FPU precision 64 bit, rounding to nearest, affine infinity }
    _controlfp($000C0003, $030F031F);
  end;
  {$endif wince}
  {****************************************************************************
                         stack frame related stuff
  ****************************************************************************}

  {$IFNDEF INTERNAL_BACKTRACE}
  {$define FPC_SYSTEM_HAS_GET_FRAME}
  { Returns the current value of the register used as frame pointer:
    r11, or r7 on darwin. }
  function get_frame:pointer;assembler;nostackframe;
  asm
  {$ifndef darwin}
      mov     r0,r11
  {$else}
      mov     r0,r7
  {$endif}
  end;
  {$ENDIF not INTERNAL_BACKTRACE}
  {$define FPC_SYSTEM_HAS_GET_CALLER_ADDR}
  { Returns the word stored next to the given frame pointer: at framebp-4,
    or at framebp+4 on darwin. A nil framebp yields nil. The addr
    parameter is not used by this implementation.
      r0: framebp on entry, result on exit }
  function get_caller_addr(framebp:pointer;addr:pointer=nil):pointer;assembler;nostackframe;
  asm
      cmp     r0,#0
  {$ifndef darwin}
      ldrne   r0,[r0,#-4]
  {$else}
      ldrne   r0,[r0,#4]
  {$endif}
  end;
  {$define FPC_SYSTEM_HAS_GET_CALLER_FRAME}
  { Returns the frame pointer saved in the given frame: the word at
    framebp-12, or at framebp+0 on darwin. A nil framebp yields nil. The
    addr parameter is not used by this implementation.
      r0: framebp on entry, result on exit }
  function get_caller_frame(framebp:pointer;addr:pointer=nil):pointer;assembler;nostackframe;
  asm
      cmp     r0,#0
  {$ifndef darwin}
      ldrne   r0,[r0,#-12]
  {$else}
      ldrne   r0,[r0]
  {$endif}
  end;
  {$define FPC_SYSTEM_HAS_SPTR}
  { Returns the current stack pointer. }
  Function Sptr : pointer;assembler;nostackframe;
  asm
      mov     r0,sp
  end;
  {$ifndef FPC_SYSTEM_HAS_FILLCHAR}
  {$define FPC_SYSTEM_HAS_FILLCHAR}
  { Fills count bytes starting at x with value; does nothing for count <= 0.
      r0: address of x
      r1: count
      r2: value (replicated into all four bytes of r2 below)
      r3: running write pointer, ip: second copy of the fill word
    Leading bytes are written one at a time until the pointer is word
    aligned, then 8 bytes per store, then a 4/2/1 byte tail. }
  Procedure FillChar(var x;count:longint;value:byte);assembler;nostackframe;
  asm
      // less than 0?
      cmp     r1,#0
  {$ifdef CPUARM_HAS_BX}
      bxle    lr
  {$else}
      movle   pc,lr
  {$endif}
      mov     r3,r0
      // replicate the fill byte into a full word
      orr     r2,r2,r2,lsl #8
      orr     r2,r2,r2,lsl #16

      tst     r3, #3  // Aligned?
      bne     .LFillchar_do_align

  .LFillchar_is_aligned:
      subs    r1,r1,#8
      bmi     .LFillchar_less_than_8bytes

      mov     ip,r2
  .LFillchar_at_least_8bytes:
      // Do 16 bytes per loop
      // More unrolling is unnecessary, as we'll just stall on the write buffers
      stmia   r3!,{r2,ip}
      subs    r1,r1,#8
      stmplia r3!,{r2,ip}
      subpls  r1,r1,#8
      bpl     .LFillchar_at_least_8bytes

  .LFillchar_less_than_8bytes:
      // Do the rest (r1 is remaining-8 here, so add the 8 back)
      adds    r1, r1, #8
  {$ifdef CPUARM_HAS_BX}
      bxeq    lr
  {$else}
      moveq   pc,lr
  {$endif}
      tst     r1, #4
      strne   r2,[r3],#4
      tst     r1, #2
      strneh  r2,[r3],#2
      tst     r1, #1
      strneb  r2,[r3],#1
  {$ifdef CPUARM_HAS_BX}
      bx      lr
  {$else}
      mov     pc,lr
  {$endif}

      // Special case for unaligned start
      // We make a maximum of 3 loops here
  .LFillchar_do_align:
      strb    r2,[r3],#1
      subs    r1, r1, #1
  {$ifdef CPUARM_HAS_BX}
      bxeq    lr
  {$else}
      moveq   pc,lr
  {$endif}
      tst     r3,#3
      bne     .LFillchar_do_align
      b       .LFillchar_is_aligned
  end;
  {$endif FPC_SYSTEM_HAS_FILLCHAR}
  {$ifndef FPC_SYSTEM_HAS_MOVE}
  {$define FPC_SYSTEM_HAS_MOVE}

  { Copies count bytes from source to dest using pld prefetch hints; does
    nothing for count <= 0. When CPUARM_HAS_EDSP is defined this is Move
    itself, otherwise it is compiled as Move_pld.
      r0: address of source
      r1: address of dest
      r2: count
      r3, ip: scratch
    If dest lies above source and inside the source range the copy runs
    byte-wise from the last byte down. Otherwise it runs forward: 8 bytes
    per iteration when count >= 8 and both pointers are word aligned,
    byte-wise in every other case. }
  {$ifdef CPUARM_HAS_EDSP}
  procedure Move(const source;var dest;count:longint);[public, alias: 'FPC_MOVE'];assembler;nostackframe;
  {$else CPUARM_HAS_EDSP}
  procedure Move_pld(const source;var dest;count:longint);assembler;nostackframe;
  {$endif CPUARM_HAS_EDSP}
  asm
      pld     [r0]
      // count <=0 ?
      cmp     r2,#0
  {$ifdef CPUARM_HAS_BX}
      bxle    lr
  {$else}
      movle   pc,lr
  {$endif}
      // overlap?
      subs    r3, r1, r0      // if (dest > source) and
      cmphi   r2, r3          //    (count > dest - src) then
      bhi     .Loverlapped    //   DoReverseByteCopy;

      cmp     r2,#8           // if (count < 8) then
      blt     .Lbyteloop      //   DoForwardByteCopy;
      // Any way to avoid the above jump and fuse the next two instructions?
      tst     r0, #3          // if (source and 3) <> 0 or
      tsteq   r1, #3          //    (dest and 3) <> 0 then
      bne     .Lbyteloop      //   DoForwardByteCopy;
      pld     [r0,#32]
  .Ldwordloop:
      ldmia   r0!, {r3, ip}
      // preload
      pld     [r0,#64]
      sub     r2,r2,#8
      cmp     r2, #8
      stmia   r1!, {r3, ip}
      bge     .Ldwordloop
      cmp     r2,#0
  {$ifdef CPUARM_HAS_BX}
      bxeq    lr
  {$else}
      moveq   pc,lr
  {$endif}
  .Lbyteloop:
      subs    r2,r2,#1
      ldrb    r3,[r0],#1
      strb    r3,[r1],#1
      bne     .Lbyteloop
  {$ifdef CPUARM_HAS_BX}
      bx      lr
  {$else}
      mov     pc,lr
  {$endif}
  .Loverlapped:
      // r2 serves as both remaining count and byte index
      subs    r2,r2,#1
      ldrb    r3,[r0,r2]
      strb    r3,[r1,r2]
      bne     .Loverlapped
  end;
  {$ifndef CPUARM_HAS_EDSP}
  { Same copy routine as Move_pld but without any pld instruction; does
    nothing for count <= 0.
      r0: address of source
      r1: address of dest
      r2: count
      r3, ip: scratch }
  procedure Move_blended(const source;var dest;count:longint);assembler;nostackframe;
  asm
      // count <=0 ?
      cmp     r2,#0
  {$ifdef CPUARM_HAS_BX}
      bxle    lr
  {$else}
      movle   pc,lr
  {$endif}
      // overlap?
      subs    r3, r1, r0      // if (dest > source) and
      cmphi   r2, r3          //    (count > dest - src) then
      bhi     .Loverlapped    //   DoReverseByteCopy;

      cmp     r2,#8           // if (count < 8) then
      blt     .Lbyteloop      //   DoForwardByteCopy;
      // Any way to avoid the above jump and fuse the next two instructions?
      tst     r0, #3          // if (source and 3) <> 0 or
      tsteq   r1, #3          //    (dest and 3) <> 0 then
      bne     .Lbyteloop      //   DoForwardByteCopy;
  .Ldwordloop:
      ldmia   r0!, {r3, ip}
      sub     r2,r2,#8
      cmp     r2, #8
      stmia   r1!, {r3, ip}
      bge     .Ldwordloop
      cmp     r2,#0
  {$ifdef CPUARM_HAS_BX}
      bxeq    lr
  {$else}
      moveq   pc,lr
  {$endif}
  .Lbyteloop:
      subs    r2,r2,#1
      ldrb    r3,[r0],#1
      strb    r3,[r1],#1
      bne     .Lbyteloop
  {$ifdef CPUARM_HAS_BX}
      bx      lr
  {$else}
      mov     pc,lr
  {$endif}
  .Loverlapped:
      // r2 serves as both remaining count and byte index
      subs    r2,r2,#1
      ldrb    r3,[r0,r2]
      strb    r3,[r1,r2]
      bne     .Loverlapped
  end;
  const
    { Implementation Move dispatches to; starts out as the variant
      without pld. }
    moveproc : procedure(const source;var dest;count:longint) = @move_blended;

  { Public Move entry point for builds without CPUARM_HAS_EDSP: forwards
    to whatever moveproc currently points at. The non-PIC version jumps
    there directly with r0-r2 and lr untouched, so the target returns
    straight to Move's caller. }
  procedure Move(const source;var dest;count:longint);[public, alias: 'FPC_MOVE']; {$ifndef FPC_PIC} assembler;nostackframe; {$endif FPC_PIC}
  {$ifdef FPC_PIC}
  begin
    moveproc(source,dest,count);
  end;
  {$else FPC_PIC}
  asm
      ldr     ip,.Lmoveproc   // ip := address of the moveproc variable
      ldr     pc,[ip]         // jump to the routine stored in it
  .Lmoveproc:
      .long   moveproc
  end;
  {$endif FPC_PIC}

  {$endif CPUARM_HAS_EDSP}

  {$endif FPC_SYSTEM_HAS_MOVE}
  {****************************************************************************
                                   String
  ****************************************************************************}

  {$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}
  {$define FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}

  { Copies the shortstring sstr into res, truncating it to the limit in r1.
      r0: __RESULT (address of res)
      r1: upper bound for the copied length (presumably the hidden
          high(res) argument - confirm against the compiler's calling
          convention); reused as data scratch after the clamp
      r2: sstr
      r3: scratch counter, r12: number of characters still to copy }
  procedure fpc_shortstr_to_shortstr(out res:shortstring;const sstr:shortstring);assembler;nostackframe;[public,alias: 'FPC_SHORTSTR_TO_SHORTSTR'];compilerproc;
  asm
      ldrb    r12,[r2],#1     (* r12 := length byte of sstr *)
      cmp     r12,r1
      movgt   r12,r1          (* clamp to the destination limit *)
      strb    r12,[r0],#1     (* store the resulting length byte *)
      cmp     r12,#6          (* 6 seems to be the break even point. *)
      blt     .LStartTailCopy
      (* Align destination on 32bits. This is the only place where unrolling
         really seems to help, since in the common case, sstr is aligned on
         32 bits, therefore in the common case we need to copy 3 bytes to
         align, i.e. in the case of a loop, you wouldn't branch out early.*)
      rsb     r3,r0,#0
      ands    r3,r3,#3        (* r3 := bytes needed to align r0 (0..3) *)
      sub     r12,r12,r3
      ldrneb  r1,[r2],#1
      strneb  r1,[r0],#1
      subnes  r3,r3,#1
      ldrneb  r1,[r2],#1
      strneb  r1,[r0],#1
      subnes  r3,r3,#1
      ldrneb  r1,[r2],#1
      strneb  r1,[r0],#1
      subnes  r3,r3,#1
  .LDoneAlign:
      (* Destination should be aligned now, but source might not be aligned,
         if this is the case, do a byte-per-byte copy. *)
      tst     r2,#3
      bne     .LStartTailCopy
      (* Start the main copy, 32 bit at a time. *)
      movs    r3,r12,lsr #2   (* r3 := whole words; flags feed the beq below *)
      and     r12,r12,#3      (* r12 := leftover bytes; flags untouched *)
      beq     .LStartTailCopy
  .LNext4bytes:
      (* Unrolling this loop would save a little bit of time for long strings
         (>20 chars), but alas, it hurts for short strings and they are the
         common case.*)
      ldrne   r1,[r2],#4
      strne   r1,[r0],#4
      subnes  r3,r3,#1
      bne     .LNext4bytes
  .LStartTailCopy:
      (* Do remaining bytes. *)
      cmp     r12,#0
      beq     .LDoneTail
  .LNextChar3:
      ldrb    r1,[r2],#1
      strb    r1,[r0],#1
      subs    r12,r12,#1
      bne     .LNextChar3
  .LDoneTail:
  end;
  { Copies the shortstring at sstr to dstr, truncating it to len characters.
      r0: len (reused as data scratch after the clamp)
      r1: sstr
      r2: dstr
      r3: scratch counter, r12: number of characters still to copy }
  procedure fpc_shortstr_assign(len:longint;sstr,dstr:pointer);assembler;nostackframe;[public,alias:'FPC_SHORTSTR_ASSIGN'];compilerproc;
  asm
      ldrb    r12,[r1],#1     (* r12 := length byte of the source *)
      cmp     r12,r0
      movgt   r12,r0          (* clamp to len *)
      strb    r12,[r2],#1     (* store the resulting length byte *)
      cmp     r12,#6          (* 6 seems to be the break even point. *)
      blt     .LStartTailCopy
      (* Align destination on 32bits. This is the only place where unrolling
         really seems to help, since in the common case, sstr is aligned on
         32 bits, therefore in the common case we need to copy 3 bytes to
         align, i.e. in the case of a loop, you wouldn't branch out early.*)
      rsb     r3,r2,#0
      ands    r3,r3,#3        (* r3 := bytes needed to align r2 (0..3) *)
      sub     r12,r12,r3
      ldrneb  r0,[r1],#1
      strneb  r0,[r2],#1
      subnes  r3,r3,#1
      ldrneb  r0,[r1],#1
      strneb  r0,[r2],#1
      subnes  r3,r3,#1
      ldrneb  r0,[r1],#1
      strneb  r0,[r2],#1
      subnes  r3,r3,#1
  .LDoneAlign:
      (* Destination should be aligned now, but source might not be aligned,
         if this is the case, do a byte-per-byte copy. *)
      tst     r1,#3
      bne     .LStartTailCopy
      (* Start the main copy, 32 bit at a time. *)
      movs    r3,r12,lsr #2   (* r3 := whole words; flags feed the beq below *)
      and     r12,r12,#3      (* r12 := leftover bytes; flags untouched *)
      beq     .LStartTailCopy
  .LNext4bytes:
      (* Unrolling this loop would save a little bit of time for long strings
         (>20 chars), but alas, it hurts for short strings and they are the
         common case.*)
      ldrne   r0,[r1],#4
      strne   r0,[r2],#4
      subnes  r3,r3,#1
      bne     .LNext4bytes
  .LStartTailCopy:
      (* Do remaining bytes. *)
      cmp     r12,#0
      beq     .LDoneTail
  .LNextChar3:
      ldrb    r0,[r1],#1
      strb    r0,[r2],#1
      subs    r12,r12,#1
      bne     .LNextChar3
  .LDoneTail:
  end;
  {$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}
  {$ifndef FPC_SYSTEM_HAS_FPC_PCHAR_LENGTH}
  {$define FPC_SYSTEM_HAS_FPC_PCHAR_LENGTH}
  { Returns the number of characters before the terminating #0 of p, or 0
    when p is nil.
      r0: p on entry, result on exit
      r1: scan pointer
      r2, r12: scratch, r3: the constant $01010101
    While the scan pointer is word aligned, four bytes are tested at once;
    the word containing the terminator (and any unaligned start) is
    scanned byte by byte. }
  function fpc_Pchar_length(p:Pchar):sizeint;assembler;nostackframe;[public,alias:'FPC_PCHAR_LENGTH'];compilerproc;
  asm
      cmp     r0,#0
      mov     r1,r0
      beq     .Ldone
  .Lnextchar:
      (*Are we aligned?*)
      tst     r1,#3
      bne     .Ltest_unaligned    (*No, do byte per byte.*)
      ldr     r3,.L01010101
  .Ltest_aligned:
      (*Aligned, load 4 bytes at a time.*)
      ldr     r12,[r1],#4
      (*Check whether r12 contains a 0 byte.*)
      sub     r2,r12,r3
      mvn     r12,r12
      and     r2,r2,r12
      ands    r2,r2,r3,lsl #7     (*r3 lsl 7 = $80808080*)
      beq     .Ltest_aligned      (*No 0 byte, repeat.*)
      sub     r1,r1,#4            (*Step back to rescan this word bytewise.*)
  .Ltest_unaligned:
      ldrb    r12,[r1],#1
      cmp     r12,#1              (*r12<1 same as r12=0, but result in carry flag*)
      bcs     .Lnextchar
      (*Dirty trick: we need to subtract 1 extra because we have counted the
        terminating 0, due to the known carry flag sbc can do this.*)
      sbc     r0,r1,r0
  .Ldone:
  {$ifdef CPUARM_HAS_BX}
      bx      lr
  {$else}
      mov     pc,lr
  {$endif}
  .L01010101:
      .long   0x01010101
  end;
  {$endif}
  {$ifndef darwin}
  {$define FPC_SYSTEM_HAS_ANSISTR_DECR_REF}
  { Releases one reference to the ansistring S and always sets S to nil.
    Nothing further happens for a nil string or for one whose reference
    count (the longint at S-8) is negative, i.e. a constant string.
    Otherwise the count is decremented through InterLockedDecrement and,
    when it reaches zero, the block starting at S-12 is handed to
    FPC_FREEMEM.
      r0: address of S
      r1: old string pointer, r2: scratch / reference count }
  Procedure fpc_ansistr_decr_ref (Var S : Pointer); [Public,Alias:'FPC_ANSISTR_DECR_REF'];assembler;nostackframe; compilerproc;
  asm
      ldr     r1, [r0]
      // On return the pointer will always be set to zero, so utilize the delay slots
      mov     r2, #0
      str     r2, [r0]

      // Check for a zero string
      cmp     r1, #0
      // Load reference counter
      ldrne   r2, [r1, #-8]
  {$ifdef CPUARM_HAS_BX}
      bxeq    lr
  {$else}
      moveq   pc,lr
  {$endif}
      // Check for a constant string
      cmp     r2, #0
  {$ifdef CPUARM_HAS_BX}
      bxlt    lr
  {$else}
      movlt   pc,lr
  {$endif}
      stmfd   sp!, {r1, lr}
      sub     r0, r1, #8
  {$ifdef CPUARM_HAS_BLX}
      blx     InterLockedDecrement
  {$else}
      bl      InterLockedDecrement
  {$endif}
      // InterLockedDecrement is a nice guy and sets the z flag for us
      // if the reference count dropped to 0
      ldmnefd sp!, {r1, pc}
      ldmfd   sp!, {r0, lr}
      // We currently can not use constant symbols in ARM-Assembly
      // but we need to stay backward compatible with 2.6
      sub     r0, r0, #12
      // Jump without a link, so freemem directly returns to our caller
      b       FPC_FREEMEM
  end;
  {$define FPC_SYSTEM_HAS_ANSISTR_INCR_REF}
  { Adds one reference to the ansistring S. Nothing happens for a nil
    string or for one whose reference count (the longint at S-8) is
    negative, i.e. a constant string; otherwise the count is incremented
    by tail-calling InterLockedIncrement.
      r0: S on entry, address of the reference count for the tail call
      r1: reference count }
  Procedure fpc_ansistr_incr_ref (S : Pointer); [Public,Alias:'FPC_ANSISTR_INCR_REF'];assembler;nostackframe; compilerproc;
  asm
      // Null string?
      cmp     r0, #0
      // Load reference counter
      ldrne   r1, [r0, #-8]
      // pointer to counter, calculate here for delay slot utilization
      subne   r0, r0, #8
  {$ifdef CPUARM_HAS_BX}
      bxeq    lr
  {$else}
      moveq   pc,lr
  {$endif}
      // Check for a constant string
      cmp     r1, #0
      // Tailcall
      // Hopefully the linker will place InterLockedIncrement as layed out here
      bge     InterLockedIncrement
      // Freepascal will generate a proper return here, save some cachespace
  end;
  {$endif not darwin}
  // --- InterLocked functions begin

  {$if not defined(CPUARM_HAS_LDREX) and not defined(SYSTEM_HAS_KUSER_CMPXCHG) }
    // Use generic interlock implementation

    var
      { Lock word taken with swp by the generic InterLocked routines below:
        0 = free, 1 = held. }
      fpc_system_lock: longint;

    {$ifdef FPC_PIC}
      // Use generic interlock implementation with PIC

      // A helper function to get a pointer to fpc_system_lock in the PIC compatible way.
      function get_fpc_system_lock_ptr: pointer;
      begin
        get_fpc_system_lock_ptr:=@fpc_system_lock;
      end;

    {$endif FPC_PIC}
  {$endif}
  { Atomically decrements Target and returns the new value. All three
    variants also leave the Z flag set when the new value is zero;
    fpc_ansistr_decr_ref relies on that.
      r0: address of Target on entry, new value on exit
    Variants: ldrex/strex retry loop, the kuser_cmpxchg kernel helper at
    0xffff0fc0, or a swp based spin lock on fpc_system_lock. }
  function InterLockedDecrement (var Target: longint) : longint; assembler; nostackframe;
  asm
  {$ifdef CPUARM_HAS_LDREX}
  .Lloop:
      ldrex   r1, [r0]
      sub     r1, r1, #1
      strex   r2, r1, [r0]
      cmp     r2, #0          // r2 <> 0: the exclusive store failed, retry
      bne     .Lloop
      movs    r0, r1          // result; also sets Z when it is zero
      bx      lr
  {$else}
  {$ifdef SYSTEM_HAS_KUSER_CMPXCHG}
      stmfd   r13!, {lr}
      mov     r2, r0   // kuser_cmpxchg does not clobber r2 by definition
  .Latomic_dec_loop:
      ldr     r0, [r2] // Load the current value

      // We expect this to work without looping most of the time
      // R3 gets clobbered in kuser_cmpxchg so in the unlikely case that we have to
      // loop here again, we have to reload the value. Normaly this just fills the
      // load stall-cycles from the above ldr so in reality we'll not get any additional
      // delays because of this
      // Don't use ldr to load r3 to avoid cacheline trashing
      // Load 0xffff0fff into r3 and substract to 0xffff0fc0,
      // the kuser_cmpxchg entry point
      mvn     r3, #0x0000f000
      sub     r3, r3, #0x3F

      sub     r1, r0, #1 // Decrement value
  {$ifdef CPUARM_HAS_BLX}
      blx     r3       // Call kuser_cmpxchg, sets C-Flag on success
  {$else}
      mov     lr, pc
  {$ifdef CPUARM_HAS_BX}
      bx      r3
  {$else}
      mov     pc, r3
  {$endif}
  {$endif}
      // MOVS sets the Z flag when the result reaches zero, this can be used later on
      // The C-Flag will not be modified by this because we're not doing any shifting
      movcss  r0, r1   // We expect that to work most of the time so keep it pipeline friendly
      ldmcsfd r13!, {pc}
      b       .Latomic_dec_loop // C flag clear: kuser_cmpxchg failed, retry
  {$else}
      // lock
  {$ifdef FPC_PIC}
      push    {r0,lr}
  {$ifdef CPUARM_HAS_BLX}
      blx     get_fpc_system_lock_ptr
  {$else}
      bl      get_fpc_system_lock_ptr
  {$endif CPUARM_HAS_BLX}
      mov     r3,r0
      pop     {r0,lr}
  {$else FPC_PIC}
      ldr     r3, .Lfpc_system_lock
  {$endif FPC_PIC}
      mov     r1, #1
  .Lloop:
      swp     r2, r1, [r3]
      cmp     r2, #0          // old lock value <> 0: somebody else holds it
      bne     .Lloop
      // do the job
      ldr     r1, [r0]
      sub     r1, r1, #1
      str     r1, [r0]
      movs    r0, r1
      // unlock and return (r2 is 0 here; str leaves the flags alone)
      str     r2, [r3]
  {$ifdef CPUARM_HAS_BX}
      bx      lr
  {$else}
      mov     pc,lr
  {$endif}

  {$ifndef FPC_PIC}
  .Lfpc_system_lock:
      .long   fpc_system_lock
  {$endif FPC_PIC}

  {$endif}
  {$endif}
  end;
  { Atomically increments Target and returns the new value.
      r0: address of Target on entry, new value on exit
    Variants: ldrex/strex retry loop, the kuser_cmpxchg kernel helper at
    0xffff0fc0, or a swp based spin lock on fpc_system_lock. }
  function InterLockedIncrement (var Target: longint) : longint; assembler; nostackframe;
  asm
  {$ifdef CPUARM_HAS_LDREX}
  .Lloop:
      ldrex   r1, [r0]
      add     r1, r1, #1
      strex   r2, r1, [r0]
      cmp     r2, #0          // r2 <> 0: the exclusive store failed, retry
      bne     .Lloop
      mov     r0, r1
      bx      lr
  {$else}
  {$ifdef SYSTEM_HAS_KUSER_CMPXCHG}
      stmfd   r13!, {lr}
      mov     r2, r0   // kuser_cmpxchg does not clobber r2 by definition
  .Latomic_inc_loop:
      ldr     r0, [r2] // Load the current value

      // We expect this to work without looping most of the time
      // R3 gets clobbered in kuser_cmpxchg so in the unlikely case that we have to
      // loop here again, we have to reload the value. Normaly this just fills the
      // load stall-cycles from the above ldr so in reality we'll not get any additional
      // delays because of this
      // Don't use ldr to load r3 to avoid cacheline trashing
      // Load 0xffff0fff into r3 and substract to 0xffff0fc0,
      // the kuser_cmpxchg entry point
      mvn     r3, #0x0000f000
      sub     r3, r3, #0x3F

      add     r1, r0, #1 // Increment value
  {$ifdef CPUARM_HAS_BLX}
      blx     r3       // Call kuser_cmpxchg, sets C-Flag on success
  {$else}
      mov     lr, pc
  {$ifdef CPUARM_HAS_BX}
      bx      r3
  {$else}
      mov     pc, r3
  {$endif}
  {$endif}
      movcs   r0, r1   // We expect that to work most of the time so keep it pipeline friendly
      ldmcsfd r13!, {pc}
      b       .Latomic_inc_loop // C flag clear: kuser_cmpxchg failed, retry
  {$else}
      // lock
  {$ifdef FPC_PIC}
      push    {r0,lr}
  {$ifdef CPUARM_HAS_BLX}
      blx     get_fpc_system_lock_ptr
  {$else}
      bl      get_fpc_system_lock_ptr
  {$endif CPUARM_HAS_BLX}
      mov     r3,r0
      pop     {r0,lr}
  {$else FPC_PIC}
      ldr     r3, .Lfpc_system_lock
  {$endif FPC_PIC}
      mov     r1, #1
  .Lloop:
      swp     r2, r1, [r3]
      cmp     r2, #0          // old lock value <> 0: somebody else holds it
      bne     .Lloop
      // do the job
      ldr     r1, [r0]
      add     r1, r1, #1
      str     r1, [r0]
      mov     r0, r1
      // unlock and return (r2 is 0 here)
      str     r2, [r3]
  {$ifdef CPUARM_HAS_BX}
      bx      lr
  {$else}
      mov     pc,lr
  {$endif}

  {$ifndef FPC_PIC}
  .Lfpc_system_lock:
      .long   fpc_system_lock
  {$endif FPC_PIC}

  {$endif}
  {$endif}
  end;
  { Atomically stores Source into Target and returns the value Target
    held before.
      r0: address of Target on entry, previous value on exit
      r1: Source
    Variants: ldrex/strex retry loop, the kuser_cmpxchg kernel helper at
    0xffff0fc0, or a swp based spin lock on fpc_system_lock. }
  function InterLockedExchange (var Target: longint;Source : longint) : longint; assembler; nostackframe;
  asm
  {$ifdef CPUARM_HAS_LDREX}
  // swp is deprecated on ARMv6 and above
  .Lloop:
      ldrex   r2, [r0]
      strex   r3, r1, [r0]
      cmp     r3, #0          // r3 <> 0: the exclusive store failed, retry
      bne     .Lloop
      mov     r0, r2
      bx      lr
  {$else}
  {$ifdef SYSTEM_HAS_KUSER_CMPXCHG}
      stmfd   r13!, {r4, lr}
      mov     r2, r0   // kuser_cmpxchg does not clobber r2 (and r1) by definition
  .Latomic_xchg_loop:
      ldr     r0, [r2] // Load the current value

      // We expect this to work without looping most of the time
      // R3 gets clobbered in kuser_cmpxchg so in the unlikely case that we have to
      // loop here again, we have to reload the value. Normaly this just fills the
      // load stall-cycles from the above ldr so in reality we'll not get any additional
      // delays because of this
      // Don't use ldr to load r3 to avoid cacheline trashing
      // Load 0xffff0fff into r3 and substract to 0xffff0fc0,
      // the kuser_cmpxchg entry point
      mvn     r3, #0x0000f000
      sub     r3, r3, #0x3F
      mov     r4, r0   // save the current value because kuser_cmpxchg clobbers r0
  {$ifdef CPUARM_HAS_BLX}
      blx     r3       // Call kuser_cmpxchg, sets C-Flag on success
  {$else}
      mov     lr, pc
  {$ifdef CPUARM_HAS_BX}
      bx      r3
  {$else}
      mov     pc, r3
  {$endif}
  {$endif}
      // restore the original value if needed
      movcs   r0, r4
      ldmcsfd r13!, {r4, pc}
      b       .Latomic_xchg_loop // kuser_cmpxchg failed, loop back
  {$else}
      // lock
  {$ifdef FPC_PIC}
      push    {r0,r1,lr}
  {$ifdef CPUARM_HAS_BLX}
      blx     get_fpc_system_lock_ptr
  {$else}
      bl      get_fpc_system_lock_ptr
  {$endif CPUARM_HAS_BLX}
      mov     r3,r0
      pop     {r0,r1,lr}
  {$else FPC_PIC}
      ldr     r3, .Lfpc_system_lock
  {$endif FPC_PIC}
      mov     r2, #1
  .Lloop:
      swp     r2, r2, [r3]
      cmp     r2, #0          // old lock value <> 0: somebody else holds it
      bne     .Lloop
      // do the job
      ldr     r2, [r0]
      str     r1, [r0]
      mov     r0, r2
      // unlock and return
      mov     r2, #0
      str     r2, [r3]
  {$ifdef CPUARM_HAS_BX}
      bx      lr
  {$else}
      mov     pc,lr
  {$endif}

  {$ifndef FPC_PIC}
  .Lfpc_system_lock:
      .long   fpc_system_lock
  {$endif FPC_PIC}

  {$endif}
  {$endif}
  end;
  { Atomically adds Source to Target and returns the value Target held
    before the addition.
      r0: address of Target on entry, previous value on exit
      r1: Source
    Variants: ldrex/strex retry loop, the kuser_cmpxchg kernel helper at
    0xffff0fc0, or a swp based spin lock on fpc_system_lock. }
  function InterLockedExchangeAdd (var Target: longint;Source : longint) : longint; assembler; nostackframe;
  asm
  {$ifdef CPUARM_HAS_LDREX}
  .Lloop:
      ldrex   r2, [r0]
      add     r12, r1, r2
      strex   r3, r12, [r0]
      cmp     r3, #0          // r3 <> 0: the exclusive store failed, retry
      bne     .Lloop
      mov     r0, r2
      bx      lr
  {$else}
  {$ifdef SYSTEM_HAS_KUSER_CMPXCHG}
      stmfd   r13!, {r4, lr}
      mov     r2, r0   // kuser_cmpxchg does not clobber r2 by definition
      mov     r4, r1   // Save addend
  .Latomic_add_loop:
      ldr     r0, [r2] // Load the current value

      // We expect this to work without looping most of the time
      // R3 gets clobbered in kuser_cmpxchg so in the unlikely case that we have to
      // loop here again, we have to reload the value. Normaly this just fills the
      // load stall-cycles from the above ldr so in reality we'll not get any additional
      // delays because of this
      // Don't use ldr to load r3 to avoid cacheline trashing
      // Load 0xffff0fff into r3 and substract to 0xffff0fc0,
      // the kuser_cmpxchg entry point
      mvn     r3, #0x0000f000
      sub     r3, r3, #0x3F

      add     r1, r0, r4 // Add to value
  {$ifdef CPUARM_HAS_BLX}
      blx     r3       // Call kuser_cmpxchg, sets C-Flag on success
  {$else}
      mov     lr, pc
  {$ifdef CPUARM_HAS_BX}
      bx      r3
  {$else}
      mov     pc, r3
  {$endif}
  {$endif}
      // r1 does not get clobbered, so just get back the original value
      // Otherwise we would have to allocate one more register and store the
      // temporary value
      subcs   r0, r1, r4
      ldmcsfd r13!, {r4, pc}
      b       .Latomic_add_loop // kuser_cmpxchg failed, loop back
  {$else}
      // lock
  {$ifdef FPC_PIC}
      push    {r0,r1,lr}
  {$ifdef CPUARM_HAS_BLX}
      blx     get_fpc_system_lock_ptr
  {$else}
      bl      get_fpc_system_lock_ptr
  {$endif CPUARM_HAS_BLX}
      mov     r3,r0
      pop     {r0,r1,lr}
  {$else FPC_PIC}
      ldr     r3, .Lfpc_system_lock
  {$endif FPC_PIC}
      mov     r2, #1
  .Lloop:
      swp     r2, r2, [r3]
      cmp     r2, #0          // old lock value <> 0: somebody else holds it
      bne     .Lloop
      // do the job
      ldr     r2, [r0]
      add     r1, r1, r2
      str     r1, [r0]
      mov     r0, r2
      // unlock and return
      mov     r2, #0
      str     r2, [r3]
  {$ifdef CPUARM_HAS_BX}
      bx      lr
  {$else}
      mov     pc,lr
  {$endif}

  {$ifndef FPC_PIC}
  .Lfpc_system_lock:
      .long   fpc_system_lock
  {$endif FPC_PIC}

  {$endif}
  {$endif}
  end;
  { Atomically replaces Target with NewValue if Target equals Comperand.
    Returns the value found in Target (equal to Comperand exactly when
    the store took place).
      r0: address of Target on entry, found value on exit
      r1: NewValue
      r2: Comperand
    Variants: ldrex/strex retry loop, the kuser_cmpxchg kernel helper at
    0xffff0fc0, or a swp based spin lock on fpc_system_lock. }
  function InterlockedCompareExchange(var Target: longint; NewValue: longint; Comperand: longint): longint; assembler; nostackframe;
  asm
  {$ifdef CPUARM_HAS_LDREX}
  .Lretry:
      ldrex    r3, [r0]
      mov      r12, #0        // stays 0 when no store is attempted
      cmp      r3, r2
      strexeq  r12, r1, [r0]
      cmp      r12, #0        // r12 <> 0: the exclusive store failed, retry
      bne      .Lretry
      mov      r0, r3
      bx       lr
  {$else}
  {$ifdef SYSTEM_HAS_KUSER_CMPXCHG}
      stmfd   r13!, {r4, lr}

      mov     r4, r2   // Swap parameters around
      mov     r2, r0
      mov     r0, r4   // Use r4 because we'll need the new value for later

      // r1 and r2 will not be clobbered by kuser_cmpxchg
      // If we have to loop, r0 will be set to the original Comperand
      // kuser_cmpxchg is documented to destroy r3, therefore setting
      // r3 must be in the loop
  .Linterlocked_compare_exchange_loop:
      mvn     r3, #0x0000f000
      sub     r3, r3, #0x3F
  {$ifdef CPUARM_HAS_BLX}
      blx     r3       // Call kuser_cmpxchg, sets C-Flag on success
  {$else}
      mov     lr, pc
  {$ifdef CPUARM_HAS_BX}
      bx      r3
  {$else}
      mov     pc, r3
  {$endif}
  {$endif}
      movcs   r0, r4   // Return the previous value on success
      ldmcsfd r13!, {r4, pc}
      // The error case is a bit tricky, kuser_cmpxchg does not return the current value
      // So we may need to loop to avoid race conditions
      // The loop case is HIGHLY unlikely, it would require that we got rescheduled between
      // calling kuser_cmpxchg and the ldr. While beeing rescheduled another process/thread
      // would have the set the value to our comperand
      ldr     r0, [r2] // Load the currently set value
      cmp     r0, r4   // Return if Comperand != current value, otherwise loop again
      ldmnefd r13!, {r4, pc}
      // If we need to loop here, r0 already holds the Comperand again,
      // which is what kuser_cmpxchg expects
      b       .Linterlocked_compare_exchange_loop
  {$else}
      // lock
  {$ifdef FPC_PIC}
      push    {r0,r1,r2,lr}
  {$ifdef CPUARM_HAS_BLX}
      blx     get_fpc_system_lock_ptr
  {$else}
      bl      get_fpc_system_lock_ptr
  {$endif CPUARM_HAS_BLX}
      mov     r12,r0
      pop     {r0,r1,r2,lr}
  {$else FPC_PIC}
      ldr     r12, .Lfpc_system_lock
  {$endif FPC_PIC}
      mov     r3, #1
  .Lloop:
      swp     r3, r3, [r12]
      cmp     r3, #0          // old lock value <> 0: somebody else holds it
      bne     .Lloop
      // do the job
      ldr     r3, [r0]
      cmp     r3, r2
      streq   r1, [r0]
      mov     r0, r3
      // unlock and return
      mov     r3, #0
      str     r3, [r12]
  {$ifdef CPUARM_HAS_BX}
      bx      lr
  {$else}
      mov     pc,lr
  {$endif}

  {$ifndef FPC_PIC}
  .Lfpc_system_lock:
      .long   fpc_system_lock
  {$endif FPC_PIC}

  {$endif}
  {$endif}
  end;
  {$define FPC_SYSTEM_HAS_DECLOCKED_LONGINT}
  { Decrements l through InterLockedDecrement and returns True exactly when
    that call yields 0. }
  function declocked(var l: longint) : boolean; inline;
  begin
    declocked:=(InterLockedDecrement(l) = 0);
  end;
  {$define FPC_SYSTEM_HAS_INCLOCKED_LONGINT}
  { Increments l through InterLockedIncrement; the function result of that
    call is discarded. }
  procedure inclocked(var l: longint); inline;
  begin
    InterLockedIncrement(l);
  end;
  953. // --- InterLocked functions end
  { CPU specific start-up initialisation.

    Only does work when FPC_SYSTEM_FPC_MOVE is defined. cpu_has_edsp is set to
    true first. When the compile target does not guarantee EDSP
    (CPUARM_HAS_EDSP undefined) an ldrd instruction is executed as a probe
    while in_edsp_test is set, and moveproc is then pointed at move_pld or
    move_blended depending on whether cpu_has_edsp is still true.

    NOTE(review): presumably a fault handler elsewhere clears cpu_has_edsp
    when the probe traps while in_edsp_test is set; that code is not visible
    here - confirm. }
  procedure fpc_cpucodeinit;
  begin
  {$ifdef FPC_SYSTEM_FPC_MOVE}
    cpu_has_edsp:=true;
  {$ifndef CPUARM_HAS_EDSP}
    in_edsp_test:=true;
    asm
      bic r0,sp,#7        // r0 = sp with the low three bits cleared (8-byte aligned address)
      ldrd r0,r1,[r0]     // probe instruction: doubleword load from that address
    end;
    in_edsp_test:=false;
    if not cpu_has_edsp then
      moveproc:=@move_blended
    else
      moveproc:=@move_pld;
  {$endif CPUARM_HAS_EDSP}
  {$endif FPC_SYSTEM_FPC_MOVE}
  end;
  {$define FPC_SYSTEM_HAS_SWAPENDIAN}
  { SwapEndian(<16 Bit>) being inlined is faster than using assembler }

  { Returns AValue with its two bytes exchanged. }
  function SwapEndian(const AValue: SmallInt): SmallInt;{$ifdef SYSTEMINLINE}inline;{$endif}
  var
    Bits: Word;
  begin
    { Work on the unsigned bit pattern: "AValue shr 8" on the signed value }
    { would be evaluated as "longint(AValue) shr 8", which for AValue < 0  }
    { shifts in sign bits from the upper 16 bits instead of zeroes.        }
    Bits := Word(AValue);
    Result := SmallInt((Bits shl 8) or (Bits shr 8));
  end;
  { Returns AValue with its two bytes exchanged; the Word cast drops the bits
    that "shl 8" pushes above bit 15. }
  function SwapEndian(const AValue: Word): Word;{$ifdef SYSTEMINLINE}inline;{$endif}
  begin
    Result := Word((AValue shl 8) or (AValue shr 8));
  end;
  988. (*
  This is kept for reference. That's what the compiler COULD generate in these cases.
  990. But FPC currently does not support inlining of asm-functions, so the whole call-overhead
  991. is bigger than the gain of the optimized function.
  992. function AsmSwapEndian(const AValue: SmallInt): SmallInt;{$ifdef SYSTEMINLINE}inline;{$endif};assembler;nostackframe;
  993. asm
  994. // We're starting with 4321
  995. {$if defined(CPUARM_HAS_REV)}
  996. rev r0, r0 // Reverse byteorder r0 = 1234
  997. mov r0, r0, shr #16 // Shift down to 16bits r0 = 0012
  998. {$else}
  999. mov r0, r0, shl #16 // Shift to make that 2100
  1000. mov r0, r0, ror #24 // Rotate to 1002
  orr r0, r0, r0 shr #16 // Shift and combine: r0 = 1012, i.e. the low 16 bits are 12 (upper half is not cleared)
  1002. {$endif}
  1003. end;
  1004. *)
  {
    This used to be an assembler function, but with newer improvements to the
    compiler the Pascal version generates a perfect 4 cycle code sequence and
    can be inlined.

    Returns AValue with its four bytes in reversed order.
  }
  function SwapEndian(const AValue: LongWord): LongWord;{$ifdef SYSTEMINLINE}inline;{$endif}
  var
    Mixed: LongWord;
  begin
    { xor of the value with itself rotated by 16, then bits 16..23 cleared }
    Mixed:= (AValue xor rordword(AValue,16)) and $FF00FFFF;
    { folding that in (shifted down by 8) turns the ror-8 value into the byte swap }
    Result:= rordword(AValue,8) xor (Mixed shr 8);
  end;
  { Signed 32 bit variant: reinterprets AValue as LongWord, defers to the
    unsigned SwapEndian overload and casts the result back. }
  function SwapEndian(const AValue: LongInt): LongInt;{$ifdef SYSTEMINLINE}inline;{$endif}
  begin
    Result:=LongInt(SwapEndian(LongWord(AValue)));
  end;
  {
    Currently freepascal will not generate a good assembler sequence for
      Result:=(SwapEndian(longword(lo(AValue))) shl 32) or
              (SwapEndian(longword(hi(AValue))));
    So we keep an assembly version for now.

    In : r0, r1 = the two 32 bit halves of AValue
    Out: r0 = byte-reversed former r1, r1 = byte-reversed former r0
  }
  function SwapEndian(const AValue: Int64): Int64; assembler; nostackframe;
  asm
    // fpc >2.6.0 adds the "rev" instruction in the internal assembler
  {$if defined(CPUARM_HAS_REV)}
    rev r2, r0                      // r2 = reversed r0
    rev r0, r1                      // r0 = reversed r1
    mov r1, r2
  {$else}
    mov r12, r1                     // keep the second word, r1 is overwritten below

    // First word; example values assume r0 = $87654321 on entry
    eor r1, r0, r0, ror #16         // r1 = $C444C444
    bic r1, r1, #0x00FF0000         // r1 = r1 and $ff00ffff = $C400C444
    mov r0, r0, ror #8              // r0 = $21876543
    eor r1, r0, r1, lsr #8          // r1 = $21436587

    // Second word: the same four steps applied to the saved copy, result in r0
    eor r0, r12, r12, ror #16
    bic r0, r0, #0x00FF0000
    mov r12, r12, ror #8
    eor r0, r12, r0, lsr #8
  {$endif}
  end;
  { Unsigned 64 bit variant: reinterprets AValue as Int64, defers to the
    Int64 SwapEndian overload and casts the result back. }
  function SwapEndian(const AValue: QWord): QWord; {$ifdef SYSTEMINLINE}inline;{$endif}
  var
    Signed64: Int64;
  begin
    Signed64:=Int64(AValue);
    Result:=QWord(SwapEndian(Signed64));
  end;
  {$ifndef FPC_SYSTEM_HAS_MEM_BARRIER}
  {$define FPC_SYSTEM_HAS_MEM_BARRIER}

  { Generic read/readwrite barrier code.

    With CPUARM_HAS_DMB a "dmb sy" is emitted; otherwise, on CPUARMV6, the
    CP15 operation noted below is used with r0 = 0. When neither symbol is
    defined the body contains no instructions. }
  procedure barrier; assembler; nostackframe;
  asm
    // the instructions are encoded by hand to avoid bootstrap problems and
    // dependence on the -march setting of an external assembler
  {$if defined(CPUARM_HAS_DMB)}
    .long 0xf57ff05f    // dmb sy
  {$elseif defined(CPUARMV6)}
    mov r0, #0
    .long 0xee070fba    // mcr 15, 0, r0, cr7, cr10, {5}
  {$endif}
  end;
  { Read barrier; implemented by calling the generic barrier procedure. }
  procedure ReadBarrier;{$ifdef SYSTEMINLINE}inline;{$endif}
  begin
    barrier;
  end;
  { Read dependency barrier; intentionally empty on this target. }
  procedure ReadDependencyBarrier;{$ifdef SYSTEMINLINE}inline;{$endif}
  begin
    { reads imply a barrier on the earlier reads they depend on, so nothing }
    { is required on ARM                                                    }
  end;
  { Combined read/write barrier; implemented by calling the generic barrier
    procedure. }
  procedure ReadWriteBarrier;{$ifdef SYSTEMINLINE}inline;{$endif}
  begin
    barrier;
  end;
  { Write barrier.

    Kept separate from the generic barrier because, according to ARM,
    implementations of "dmb st" may be more optimal than the more generic
    "dmb sy". Without CPUARM_HAS_DMB the CPUARMV6 path issues the same CP15
    operation as the generic barrier; when neither symbol is defined the body
    contains no instructions. }
  procedure WriteBarrier; assembler; nostackframe;
  asm
  {$if defined(CPUARM_HAS_DMB)}
    .long 0xf57ff05e    // dmb st
  {$elseif defined(CPUARMV6)}
    mov r0, #0
    .long 0xee070fba    // mcr 15, 0, r0, cr7, cr10, {5}
  {$endif}
  end;

  {$endif}
  1091. {include hand-optimized assembler division code}
  1092. {$i divide.inc}