Browse Source

Work around load latency in InterlockedExchange for ARM

An LDR has a load latency of two cycles on most ARM implementations, so an
instruction that uses the loaded register right after the load stalls.
Moving the
  mov r4, r0
two instructions away from the corresponding ldr avoids these stalls.

git-svn-id: trunk@22107 -
masta 13 years ago
parent
commit
6729164fcc
1 changed file with 1 addition and 1 deletion
  1. 1 1
      rtl/arm/arm.inc

+ 1 - 1
rtl/arm/arm.inc

@@ -713,7 +713,6 @@ asm
   mov r2, r0   // kuser_cmpxchg does not clobber r2 (and r1) by definition
 .Latomic_add_loop:
   ldr r0, [r2]   // Load the current value
-  mov r4, r0     // save the current value because kuser_cmpxchg clobbers r0
 
   // We expect this to work without looping most of the time
   // R3 gets clobbered in kuser_cmpxchg so in the unlikely case that we have to
@@ -725,6 +724,7 @@ asm
   // the kuser_cmpxchg entry point
   mvn r3, #0x0000f000
   sub r3, r3, #0x3F
+  mov r4, r0     // save the current value because kuser_cmpxchg clobbers r0
 
   blx r3	 // Call kuser_cmpxchg, sets C-Flag on success
   // restore the original value if needed