o patch by Nico Erfurth: Better Locked* implementation for ARM on Linux

The following functions were changed to make use of the kernel helper
kuser_cmpxchg:
InterLockedDecrement
InterLockedIncrement
InterLockedExchangeAdd
InterLockedCompareExchange

The previous implementation using a spinlock had a couple of drawbacks:
1.) The functions could not be used safely on values not completely managed
by the process itself, because the spinlock protected the functions, not
the data. For example, think of two processes using shared memory: they
would not be able to share fpc_system_lock, making it unsafe to use these
functions on the shared values.
2.) With many active threads, there was a high chance that the scheduler
would interrupt a thread while fpc_system_lock was taken, causing every
other thread entering one of these functions to spin until the end of its
timeslice. This could result in unwanted and unnecessary latencies.
3.) Every function contained a pointer to fpc_system_lock, resulting in
two polluted D-cache lines per call and possible latencies through D-cache
misses.

The new implementation only works on Linux kernels >= 2.6.16.
The functions are implemented in a way that tries to minimize cache pollution
and load latencies.
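
For reference, kuser_cmpxchg is a routine the kernel maps into every process
at the fixed address 0xffff0fc0. It behaves like a compare-and-swap:
kuser_cmpxchg(oldval, newval, ptr) stores newval at ptr^ only if ptr^ still
equals oldval, returns 0 on success and non-zero on failure, and additionally
sets the C flag on success (which the assembler below relies on). Note that
its argument order differs from InterLockedCompareExchange, which is why that
function shuffles its parameters before the call. The following Pascal sketch
is purely an illustration of that interface, not the committed code; the
names PLong, TKuserCmpXchg and AtomicAddSketch are made up for this example:

type
  PLong = ^longint;
  // Signature of the helper: store newval at ptr^ only if ptr^ = oldval;
  // returns 0 (and sets the C flag) on success, non-zero otherwise
  TKuserCmpXchg = function(oldval, newval: longint; ptr: PLong): longint; cdecl;

// Made-up retry loop on top of the helper; returns the value seen before the add
function AtomicAddSketch(var target: longint; addend: longint): longint;
var
  cmpxchg: TKuserCmpXchg;
  old: longint;
begin
  cmpxchg := TKuserCmpXchg(pointer($ffff0fc0)); // fixed helper address on ARM Linux
  repeat
    old := target; // load the current value
    // retry if another thread changed target between the load and the swap
  until cmpxchg(old, old + addend, @target) = 0;
  AtomicAddSketch := old;
end;

var
  x: longint;
begin
  x := 0;
  Writeln(AtomicAddSketch(x, 5)); // prints 0, the value before the add
  Writeln(x);                     // prints 5; only works on ARM Linux >= 2.6.16
end.

The real implementations below do the same thing directly in assembler and
build the helper address with mvn/sub instead of a literal pool load, so no
additional cache line is touched.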

Even without multithreading the new functions are a lot faster. I did
comparisons on my 1.2 GHz Kirkwood with the following template code:

var X: longint;
begin
	X := 0;
	while X < longint(100*1000000) do
		FUNCTION(X);
	Writeln(X);
end.
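
For instance, the InterLockedIncrement measurement corresponds to:

var X: longint;
begin
	X := 0;
	while X < longint(100*1000000) do
		InterLockedIncrement(X);
	Writeln(X);
end.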

Function                     New        Old
InterLockedIncrement:        0m3.696s   0m23.220s
InterLockedExchangeAdd:      0m4.034s   0m23.242s
InterLockedCompareExchange:  0m4.703s   0m24.006s

This speedup is most probably due to the reduced memory accesses; the old
implementation's extra accesses resulted in lots of cache misses.

git-svn-id: trunk@20491 -

florian 2012-03-10 11:33:20 +00:00
parent 8c86455965
commit 5b03826549


@@ -561,6 +561,32 @@ asm
mov r0, r1
bx lr
{$else}
{$if defined(linux)}
stmfd r13!, {lr}
mov r2, r0 // kuser_cmpxchg does not clobber r2 by definition
.Latomic_dec_loop:
ldr r0, [r2] // Load the current value
// We expect this to work without looping most of the time
// R3 gets clobbered in kuser_cmpxchg so in the unlikely case that we have to
// loop here again, we have to reload the value. Normally this just fills the
// load stall-cycles from the above ldr so in reality we'll not get any additional
// delays because of this
// Don't use ldr to load r3 to avoid cacheline trashing
// Load 0xffff0fff into r3 and subtract 0x3f to get 0xffff0fc0,
// the kuser_cmpxchg entry point
mvn r3, #0x0000f000
sub r3, r3, #0x3F
sub r1, r0, #1 // Decrement value
blx r3 // Call kuser_cmpxchg, sets C-Flag on success
movcs r0, r1 // We expect that to work most of the time so keep it pipeline friendly
ldmcsfd r13!, {pc}
b .Latomic_dec_loop // C flag is clear on failure, so retry
{$else}
// lock
ldr r3, .Lfpc_system_lock
mov r1, #1
@@ -580,6 +606,7 @@ asm
.Lfpc_system_lock:
.long fpc_system_lock
{$endif}
{$endif}
end;
@@ -595,6 +622,32 @@ asm
mov r0, r1
bx lr
{$else}
{$if defined(linux)}
stmfd r13!, {lr}
mov r2, r0 // kuser_cmpxchg does not clobber r2 by definition
.Latomic_inc_loop:
ldr r0, [r2] // Load the current value
// We expect this to work without looping most of the time
// R3 gets clobbered in kuser_cmpxchg so in the unlikely case that we have to
// loop here again, we have to reload the value. Normally this just fills the
// load stall-cycles from the above ldr so in reality we'll not get any additional
// delays because of this
// Don't use ldr to load r3 to avoid cacheline trashing
// Load 0xffff0fff into r3 and subtract 0x3f to get 0xffff0fc0,
// the kuser_cmpxchg entry point
mvn r3, #0x0000f000
sub r3, r3, #0x3F
add r1, r0, #1 // Increment value
blx r3 // Call kuser_cmpxchg, sets C-Flag on success
movcs r0, r1 // We expect that to work most of the time so keep it pipeline friendly
ldmcsfd r13!, {pc}
b .Latomic_inc_loop // C flag is clear on failure, so retry
{$else}
// lock
ldr r3, .Lfpc_system_lock
mov r1, #1
@@ -614,6 +667,7 @@ asm
.Lfpc_system_lock:
.long fpc_system_lock
{$endif}
{$endif}
end;
@@ -646,6 +700,33 @@ asm
mov r0, r2
bx lr
{$else}
{$if defined(linux)}
stmfd r13!, {r4, lr}
mov r2, r0 // kuser_cmpxchg does not clobber r2 by definition
mov r4, r1 // Save addend
.Latomic_add_loop:
ldr r0, [r2] // Load the current value
// We expect this to work without looping most of the time
// R3 gets clobbered in kuser_cmpxchg so in the unlikely case that we have to
// loop here again, we have to reload the value. Normally this just fills the
// load stall-cycles from the above ldr so in reality we'll not get any additional
// delays because of this
// Don't use ldr to load r3 to avoid cacheline trashing
// Load 0xffff0fff into r3 and subtract 0x3f to get 0xffff0fc0,
// the kuser_cmpxchg entry point
mvn r3, #0x0000f000
sub r3, r3, #0x3F
add r1, r0, r4 // Add to value
blx r3 // Call kuser_cmpxchg, sets C-Flag on success
movcs r0, r1 // We expect that to work most of the time so keep it pipeline friendly
ldmcsfd r13!, {r4, pc}
b .Latomic_add_loop // C flag is clear on failure, so retry
{$else}
// lock
ldr r3, .Lfpc_system_lock
mov r2, #1
@@ -666,6 +747,7 @@ asm
.Lfpc_system_lock:
.long fpc_system_lock
{$endif}
{$endif}
end;
@@ -682,6 +764,23 @@ asm
mov r0, r3
bx lr
{$else}
{$if defined(linux)}
stmfd r13!, {lr}
mvn r3, #0x0000f000
sub r3, r3, #0x3F
mov ip, r2 // Swap parameters around
mov r2, r0
mov r0, ip
blx r3 // Call kuser_cmpxchg, sets C-Flag on success
ldrcc r0, [r2] // Load the currently set value on failure
// We could use "mov r0, r3" here, but that's undocumented
ldmfd r13!, {lr}
{$else}
// lock
ldr r12, .Lfpc_system_lock
mov r3, #1
@@ -702,6 +801,7 @@ asm
.Lfpc_system_lock:
.long fpc_system_lock
{$endif}
{$endif}
end;
{$define FPC_SYSTEM_HAS_DECLOCKED_LONGINT}