o patch by Nico Erfurth: Better Locked* implementation for ARM on Linux

The following functions were changed to make use of the kernel helper
kuser_cmpxchg:
InterLockedDecrement
InterLockedIncrement
InterLockedExchangeAdd
InterLockedCompareExchange

The previous implementation using a spinlock had a couple of drawbacks:
1.) The functions could not be used safely on values not completely managed
by the process itself, because the spinlock protected the functions, not
the data. For example, think of two processes using shared memory: they
would not be able to share fpc_system_lock, making it unsafe to use these
functions on the shared values.
2.) With many active threads, there was a high chance that the scheduler
would interrupt a thread while fpc_system_lock was taken, causing every
other thread entering one of these functions to spin until the end of its
timeslice. This could result in unwanted and unnecessary latencies.
3.) Every function contained a pointer to fpc_system_lock, resulting in
two polluted D-cache lines per call and possible latencies through D-cache
misses.

The new implementation only works on Linux kernels >= 2.6.16.
The functions are implemented in a way that tries to minimize cache pollution
and load latencies.
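
For reference, kuser_cmpxchg is a routine the kernel maps into every process
at the fixed address 0xffff0fc0. It behaves like a compare-and-swap:
kuser_cmpxchg(oldval, newval, ptr) stores newval at ptr^ only if ptr^ still
equals oldval, returns 0 on success and non-zero on failure, and additionally
sets the C flag on success (which the assembler below relies on). Note that
its argument order differs from InterLockedCompareExchange, which is why that
function shuffles its parameters before the call. The following Pascal sketch
is purely an illustration of that interface, not the committed code; the
names PLong, TKuserCmpXchg and AtomicAddSketch are made up for this example:

type
  PLong = ^longint;
  // Signature of the helper: store newval at ptr^ only if ptr^ = oldval;
  // returns 0 (and sets the C flag) on success, non-zero otherwise
  TKuserCmpXchg = function(oldval, newval: longint; ptr: PLong): longint; cdecl;

// Made-up retry loop on top of the helper; returns the value seen before the add
function AtomicAddSketch(var target: longint; addend: longint): longint;
var
  cmpxchg: TKuserCmpXchg;
  old: longint;
begin
  cmpxchg := TKuserCmpXchg(pointer($ffff0fc0)); // fixed helper address on ARM Linux
  repeat
    old := target; // load the current value
    // retry if another thread changed target between the load and the swap
  until cmpxchg(old, old + addend, @target) = 0;
  AtomicAddSketch := old;
end;

var
  x: longint;
begin
  x := 0;
  Writeln(AtomicAddSketch(x, 5)); // prints 0, the value before the add
  Writeln(x);                     // prints 5; only works on ARM Linux >= 2.6.16
end.

The real implementations below do the same thing directly in assembler and
build the helper address with mvn/sub instead of a literal pool load, so no
additional cache line is touched.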

Even without multithreading the new functions are a lot faster. I did
comparisons on my 1.2 GHz Kirkwood with the following template code:

var X: longint;
begin
	X := 0;
	while X < longint(100*1000000) do
		FUNCTION(X);
	Writeln(X);
end.
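
For instance, the InterLockedIncrement measurement corresponds to:

var X: longint;
begin
	X := 0;
	while X < longint(100*1000000) do
		InterLockedIncrement(X);
	Writeln(X);
end.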

Function                     New        Old
InterLockedIncrement:        0m3.696s   0m23.220s
InterLockedExchangeAdd:      0m4.034s   0m23.242s
InterLockedCompareExchange:  0m4.703s   0m24.006s

This speedup is most probably due to the reduced memory accesses; the old
implementation's extra accesses resulted in lots of cache misses.

git-svn-id: trunk@20491 -

florian 2012-03-10 11:33:20 +00:00
parent 8c86455965
commit 5b03826549


@@ -561,6 +561,32 @@ asm
mov r0, r1
bx lr
{$else}
{$if defined(linux)}
stmfd r13!, {lr}
mov r2, r0 // kuser_cmpxchg does not clobber r2 by definition
.Latomic_dec_loop:
ldr r0, [r2] // Load the current value
// We expect this to work without looping most of the time
// R3 gets clobbered in kuser_cmpxchg so in the unlikely case that we have to
// loop here again, we have to reload the value. Normally this just fills the
// load stall-cycles from the above ldr so in reality we'll not get any additional
// delays because of this
// Don't use ldr to load r3 to avoid cacheline trashing
// Load 0xffff0fff into r3 and subtract 0x3f to get 0xffff0fc0,
// the kuser_cmpxchg entry point
mvn r3, #0x0000f000
sub r3, r3, #0x3F
sub r1, r0, #1 // Decrement value
blx r3 // Call kuser_cmpxchg, sets C-Flag on success
movcs r0, r1 // We expect that to work most of the time so keep it pipeline friendly
ldmcsfd r13!, {pc}
b .Latomic_dec_loop // C flag is clear on failure, so retry
{$else}
// lock
ldr r3, .Lfpc_system_lock
mov r1, #1
@@ -580,6 +606,7 @@ asm
.Lfpc_system_lock:
.long fpc_system_lock
{$endif}
{$endif}
end;
@@ -595,6 +622,32 @@ asm
mov r0, r1
bx lr
{$else}
{$if defined(linux)}
stmfd r13!, {lr}
mov r2, r0 // kuser_cmpxchg does not clobber r2 by definition
.Latomic_inc_loop:
ldr r0, [r2] // Load the current value
// We expect this to work without looping most of the time
// R3 gets clobbered in kuser_cmpxchg so in the unlikely case that we have to
// loop here again, we have to reload the value. Normally this just fills the
// load stall-cycles from the above ldr so in reality we'll not get any additional
// delays because of this
// Don't use ldr to load r3 to avoid cacheline trashing
// Load 0xffff0fff into r3 and subtract 0x3f to get 0xffff0fc0,
// the kuser_cmpxchg entry point
mvn r3, #0x0000f000
sub r3, r3, #0x3F
add r1, r0, #1 // Increment value
blx r3 // Call kuser_cmpxchg, sets C-Flag on success
movcs r0, r1 // We expect that to work most of the time so keep it pipeline friendly
ldmcsfd r13!, {pc}
b .Latomic_inc_loop // C flag is clear on failure, so retry
{$else}
// lock
ldr r3, .Lfpc_system_lock
mov r1, #1
@@ -614,6 +667,7 @@ asm
.Lfpc_system_lock:
.long fpc_system_lock
{$endif}
{$endif}
end;
@@ -646,6 +700,33 @@ asm
mov r0, r2
bx lr
{$else}
{$if defined(linux)}
stmfd r13!, {r4, lr}
mov r2, r0 // kuser_cmpxchg does not clobber r2 by definition
mov r4, r1 // Save addend
.Latomic_add_loop:
ldr r0, [r2] // Load the current value
// We expect this to work without looping most of the time
// R3 gets clobbered in kuser_cmpxchg so in the unlikely case that we have to
// loop here again, we have to reload the value. Normally this just fills the
// load stall-cycles from the above ldr so in reality we'll not get any additional
// delays because of this
// Don't use ldr to load r3 to avoid cacheline trashing
// Load 0xffff0fff into r3 and subtract 0x3f to get 0xffff0fc0,
// the kuser_cmpxchg entry point
mvn r3, #0x0000f000
sub r3, r3, #0x3F
add r1, r0, r4 // Add to value
blx r3 // Call kuser_cmpxchg, sets C-Flag on success
movcs r0, r1 // We expect that to work most of the time so keep it pipeline friendly
ldmcsfd r13!, {r4, pc}
b .Latomic_add_loop // C flag is clear on failure, so retry
{$else}
// lock
ldr r3, .Lfpc_system_lock
mov r2, #1
@@ -666,6 +747,7 @@ asm
.Lfpc_system_lock:
.long fpc_system_lock
{$endif}
{$endif}
end;
@@ -682,6 +764,23 @@ asm
mov r0, r3
bx lr
{$else}
{$if defined(linux)}
stmfd r13!, {lr}
mvn r3, #0x0000f000
sub r3, r3, #0x3F
mov ip, r2 // Swap parameters around
mov r2, r0
mov r0, ip
blx r3 // Call kuser_cmpxchg, sets C-Flag on success
ldrcc r0, [r2] // Load the currently set value on failure
// We could use "mov r0, r3" here, but that's undocumented
ldmfd r13!, {lr}
{$else}
// lock
ldr r12, .Lfpc_system_lock
mov r3, #1
@@ -702,6 +801,7 @@ asm
.Lfpc_system_lock:
.long fpc_system_lock
{$endif}
{$endif}
end;
{$define FPC_SYSTEM_HAS_DECLOCKED_LONGINT}