
--- Merging r20491 into '.':
U rtl/arm/arm.inc
--- Merging r20510 into '.':
G rtl/arm/arm.inc
--- Merging r20514 into '.':
G rtl/arm/arm.inc
U tests/test/units/system/interlocked1.pp

# revisions: 20491,20510,20514
r20491 | florian | 2012-03-10 12:33:20 +0100 (Sat, 10 Mar 2012) | 47 lines
Changed paths:
M /trunk/rtl/arm/arm.inc

o patch by Nico Erfurth: Better Locked* implementation for arm on linux

The following functions were changed to make use of the kernel helper
kuser_cmpxchg:
InterLockedDecrement
InterLockedIncrement
InterLockedExchangeAdd
InterLockedCompareExchange

The previous implementation using a spinlock had a couple of drawbacks:
1.) The functions could not be used safely on values not completely managed
by the process itself, because the spinlock did not protect data but the
functions. For example, think about two processes using shared memory.
They would not be able to share fpc_system_lock, making it unsafe to use
these functions.
2.) With many active threads, there was a high chance that the scheduler
would interrupt a thread while fpc_system_lock was taken, which would
cause any other thread using one of these functions to spin until the end
of its timeslice. This could result in unwanted and unnecessary
latencies.
3.) Every function contained a pointer to fpc_system_lock, resulting in
two polluted DCache lines per call and possible latencies through DCache
misses.

The new implementation only works on Linux kernels >= 2.6.16.
The functions are implemented in a way that tries to minimize cache pollution
and load latencies.
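
As a rough illustration (not part of the patch): all of the new routines boil
down to the same compare-and-swap retry loop. Expressed in plain Pascal, with
the portable InterLockedCompareExchange primitive from the system unit standing
in for the direct kuser_cmpxchg call, the pattern looks roughly like this:

// Sketch only: the CAS retry pattern the new assembly implements, shown here
// for an ExchangeAdd-style operation.
function AtomicAddSketch(var Target: longint; Value: longint): longint;
var
  Old: longint;
begin
  repeat
    Old := Target;   // snapshot the current value
    // Try to swap Old for Old+Value; the swap only happens if Target still
    // holds Old, i.e. nobody modified it since the snapshot.
  until InterLockedCompareExchange(Target, Old + Value, Old) = Old;
  AtomicAddSketch := Old;  // like InterLockedExchangeAdd, return the old value
end;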

Even without multithreading the new functions are a lot faster. I did
comparisons on my 1.2 GHz Kirkwood with the following template code:

var X: longint;
begin
  X := 0;
  while X < longint(100*1000000) do
    FUNCTION(X);
  Writeln(X);
end.
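
For concreteness, the template with InterLockedIncrement substituted for
FUNCTION would read:

var X: longint;
begin
  X := 0;
  while X < longint(100*1000000) do
    InterLockedIncrement(X);  // the function under test
  Writeln(X);
end.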

Function                      New        Old
InterLockedIncrement:         0m3.696s   0m23.220s
InterLockedExchangeAdd:       0m4.034s   0m23.242s
InterLockedCompareExchange:   0m4.703s   0m24.006s

This speedup is most probably due to the reduced memory access; the old
spinlock-based implementation caused lots of cache misses.
r20510 | florian | 2012-03-11 08:38:21 +0100 (Sun, 11 Mar 2012) | 2 lines
Changed paths:
M /trunk/rtl/arm/arm.inc

* committed wrong patch in r20491, fixed with this revision
r20514 | florian | 2012-03-11 22:08:57 +0100 (Sun, 11 Mar 2012) | 20 lines
Changed paths:
M /trunk/rtl/arm/arm.inc
M /trunk/tests/test/units/system/interlocked1.pp

o patch by Nico Erfurth:

* Fix for InterLockedCompareExchange on ARMEL

InterLockedCompareExchange would not return the current data on failure.
Getting this to work correctly is a bit tricky. As kuser_cmpxchg does
not return the set value, we have to load it.
There is a tiny chance that we get rescheduled between calling
kuser_cmpxchg and loading the value. If the value changed in between,
there is the possibility that we would return the Comperand without
having done an actual swap, which might cause havoc and destruction.

So, if the exchange failed, compare the value and loop again in case
of CurrentValue == Comperand (see the sketch after this message).

* Improve testing of InterLockedCompareExchange

Added a test to check for the case when Comperand is different from the
current value.
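
In Pascal terms, the fixed retry logic described above amounts to something
like the following sketch. TryCmpXchgHelper is a hypothetical, non-atomic
stand-in for the kuser_cmpxchg call, included only so the control flow is
complete; the real implementation in the diff below does this in assembler.

// Hypothetical stand-in for kuser_cmpxchg: NOT atomic, for illustration only.
function TryCmpXchgHelper(Ptr: PLongint; OldVal, NewVal: longint): boolean;
var
  Matched: boolean;
begin
  Matched := Ptr^ = OldVal;
  if Matched then
    Ptr^ := NewVal;
  TryCmpXchgHelper := Matched;
end;

// Sketch of the fixed InterLockedCompareExchange control flow on ARMEL.
function CompareExchangeSketch(var Target: longint; NewValue, Comperand: longint): longint;
var
  Current: longint;
begin
  repeat
    if TryCmpXchgHelper(@Target, Comperand, NewValue) then
    begin
      // The swap succeeded, so the previous value was exactly the Comperand.
      CompareExchangeSketch := Comperand;
      exit;
    end;
    // The helper does not report the current value, so reload it ourselves.
    Current := Target;
    // If the reload reads the Comperand we may have been rescheduled between
    // the helper call and the load; loop so we never return the Comperand
    // without actually having swapped.
  until Current <> Comperand;
  CompareExchangeSketch := Current;
end;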

git-svn-id: branches/fixes_2_6@22517 -

Committed by marco, 13 years ago, commit 216005e2e9

2 changed files with 125 additions and 1 deletion:
  1. rtl/arm/arm.inc                           +115  -0
  2. tests/test/units/system/interlocked1.pp    +10  -1

+ 115 - 0
rtl/arm/arm.inc

@@ -561,6 +561,32 @@ asm
   mov r0, r1
   bx  lr
 {$else}
+{$if defined(LINUX) and defined(CPUARMEL)}
+
+  stmfd r13!, {lr}
+  mov r2, r0   // kuser_cmpxchg does not clobber r2 by definition
+.Latomic_dec_loop:
+  ldr r0, [r2]   // Load the current value
+
+  // We expect this to work without looping most of the time
+  // R3 gets clobbered in kuser_cmpxchg so in the unlikely case that we have to
+  // loop here again, we have to reload the value. Normally this just fills the
+  // load stall-cycles from the above ldr so in reality we'll not get any additional
+  // delays because of this
+  // Don't use ldr to load r3 to avoid cacheline thrashing
+  // Load 0xffff0fff into r3 and subtract down to 0xffff0fc0,
+  // the kuser_cmpxchg entry point
+  mvn r3, #0x0000f000
+  sub r3, r3, #0x3F
+
+  sub r1, r0, #1 // Decrement value
+  blx r3	 // Call kuser_cmpxchg, sets C-Flag on success
+
+  movcs r0, r1	 // We expect that to work most of the time so keep it pipeline friendly
+  ldmcsfd r13!, {pc}
+  b .Latomic_dec_loop // C flag clear: kuser_cmpxchg failed, retry
+
+{$else}
 // lock
   ldr r3, .Lfpc_system_lock
   mov r1, #1
@@ -580,6 +606,7 @@ asm
 .Lfpc_system_lock:
   .long fpc_system_lock
 {$endif}
+{$endif}
 end;
 
 
@@ -595,6 +622,32 @@ asm
   mov r0, r1
   bx  lr
 {$else}
+{$if defined(LINUX) and defined(CPUARMEL)}
+
+  stmfd r13!, {lr}
+  mov r2, r0   // kuser_cmpxchg does not clobber r2 by definition
+.Latomic_inc_loop:
+  ldr r0, [r2]   // Load the current value
+
+  // We expect this to work without looping most of the time
+  // R3 gets clobbered in kuser_cmpxchg so in the unlikely case that we have to
+  // loop here again, we have to reload the value. Normally this just fills the
+  // load stall-cycles from the above ldr so in reality we'll not get any additional
+  // delays because of this
+  // Don't use ldr to load r3 to avoid cacheline thrashing
+  // Load 0xffff0fff into r3 and subtract down to 0xffff0fc0,
+  // the kuser_cmpxchg entry point
+  mvn r3, #0x0000f000
+  sub r3, r3, #0x3F
+
+  add r1, r0, #1 // Increment value
+  blx r3	 // Call kuser_cmpxchg, sets C-Flag on success
+
+  movcs r0, r1	 // We expect that to work most of the time so keep it pipeline friendly
+  ldmcsfd r13!, {pc}
+  b .Latomic_inc_loop // C flag clear: kuser_cmpxchg failed, retry
+
+{$else}
 // lock
   ldr r3, .Lfpc_system_lock
   mov r1, #1
@@ -614,6 +667,7 @@ asm
 .Lfpc_system_lock:
   .long fpc_system_lock
 {$endif}
+{$endif}
 end;
 
 
@@ -646,6 +700,36 @@ asm
   mov  r0, r2
   bx  lr
 {$else}
+{$if defined(LINUX) and defined(CPUARMEL)}
+
+  stmfd r13!, {r4, lr}
+  mov r2, r0   // kuser_cmpxchg does not clobber r2 by definition
+  mov r4, r1   // Save addend
+.Latomic_add_loop:
+  ldr r0, [r2]   // Load the current value
+
+  // We expect this to work without looping most of the time
+  // R3 gets clobbered in kuser_cmpxchg so in the unlikely case that we have to
+  // loop here again, we have to reload the value. Normally this just fills the
+  // load stall-cycles from the above ldr so in reality we'll not get any additional
+  // delays because of this
+  // Don't use ldr to load r3 to avoid cacheline thrashing
+  // Load 0xffff0fff into r3 and subtract down to 0xffff0fc0,
+  // the kuser_cmpxchg entry point
+  mvn r3, #0x0000f000
+  sub r3, r3, #0x3F
+
+  add r1, r0, r4 // Add to value
+  blx r3	 // Call kuser_cmpxchg, sets C-Flag on success
+  // r1 does not get clobbered, so just get back the original value
+  // Otherwise we would have to allocate one more register and store the
+  // temporary value
+  subcs   r0, r1, r4
+  ldmcsfd r13!, {r4, pc}
+
+  b .Latomic_add_loop // kuser_cmpxchg failed, loop back
+
+{$else}
 // lock
   ldr r3, .Lfpc_system_lock
   mov r2, #1
@@ -666,6 +750,7 @@ asm
 .Lfpc_system_lock:
   .long fpc_system_lock
 {$endif}
+{$endif}
 end;
 
 
@@ -682,6 +767,35 @@ asm
   mov      r0, r3
   bx       lr
 {$else}
+{$if defined(LINUX) and defined(CPUARMEL)}
+
+  stmfd r13!, {r4, lr}
+  mvn   r3, #0x0000f000
+  sub   r3, r3, #0x3F
+
+  mov   r4, r2 // Swap parameters around
+  mov   r2, r0
+  mov   r0, r4 // Use r4 because we'll need the new value for later
+
+  // r1 and r2 will not be clobbered by kuser_cmpxchg
+  // If we have to loop, r0 will be set to the original Comperand
+  .Linterlocked_compare_exchange_loop:
+
+  blx   r3       // Call kuser_cmpxchg, sets C-Flag on success
+  movcs r0, r4   // Return the previous value on success
+  ldmcsfd r13!, {r4, pc}
+  // The error case is a bit tricky, kuser_cmpxchg does not return the current value
+  // So we may need to loop to avoid race conditions
+  // The loop case is HIGHLY unlikely, it would require that we got rescheduled between
+  // calling kuser_cmpxchg and the ldr. While being rescheduled another process/thread
+  // would have to have set the value to our Comperand
+  ldr	r0, [r2] // Load the currently set value
+  cmp   r0, r4   // Return if Comperand != current value, otherwise loop again
+  ldmnefd r13!, {r4, pc}
+  // The current value matches the Comperand again, so loop and retry the exchange
+  b	.Linterlocked_compare_exchange_loop
+
+{$else}
 // lock
   ldr r12, .Lfpc_system_lock
   mov r3, #1
@@ -702,6 +816,7 @@ asm
 .Lfpc_system_lock:
   .long fpc_system_lock
 {$endif}
+{$endif}
 end;
 
 {$define FPC_SYSTEM_HAS_DECLOCKED_LONGINT}

+ 10 - 1
tests/test/units/system/interlocked1.pp

@@ -7,9 +7,18 @@ begin
   InterLockedCompareExchange(target,4321,1234);
   if target<>4321 then
     halt(1);
+
   ctarget:=1234;
   InterLockedCompareExchange(ctarget,4321,1234);
   if ctarget<>4321 then
-    halt(1);
+    halt(2);
+
+  // Test what happens if we use a Comperand which is NOT currently set
+  target := 12345;
+  if(InterLockedCompareExchange(target, 54321, 123) <> 12345) then
+    halt(3);
+  if target<>12345 then
+    halt(4);
+
   writeln('ok');
 end.