Work around load latency in InterlockedExchange for ARM

An LDR has two cycles of load latency on most ARM implementations, so moving the mov r4, r0 two instructions away from the corresponding ldr avoids the stalls. git-svn-id: trunk@22107 -
2025-04-23 02:29:34 +02:00 · 2012-08-17 12:42:49 +00:00 · 2012-08-17 12:42:49 +00:00 · 6729164fcc
commit 6729164fcc
parent 7e5b8584cf
1 changed file with 1 addition and 1 deletion
--- a/rtl/arm/arm.inc
+++ b/rtl/arm/arm.inc
@@ -713,7 +713,6 @@ asm
  mov r2, r0   // kuser_cmpxchg does not clobber r2 (and r1) by definition
 .Latomic_add_loop:
  ldr r0, [r2]   // Load the current value
-  mov r4, r0     // save the current value because kuser_cmpxchg clobbers r0

  // We expect this to work without looping most of the time
  // R3 gets clobbered in kuser_cmpxchg so in the unlikely case that we have to
@@ -725,6 +724,7 @@ asm
  // the kuser_cmpxchg entry point
  mvn r3, #0x0000f000
  sub r3, r3, #0x3F
+  mov r4, r0     // save the current value because kuser_cmpxchg clobbers r0

  blx r3	 // Call kuser_cmpxchg, sets C-Flag on success
  // restore the original value if needed