o patch by Nico Erfurth: Support Assembly optimized functions of SwapEndian on ARM

Currently the ARM port uses generic functions for SwapEndian, which are relatively slow. This patch adds optimized functions for the 32- and 64-bit cases. The 16-bit case is still handled with a normal function: while the generated code is far from optimal, the inlining (which is not possible with asm functions) makes it faster than the optimized function. Some numbers from my 1.2GHz Kirkwood (ARMv5): Old New Result SwapEndian(Integer) 12.168s 5.411s 44.47% SwapEndian(Int64) 168.28s 9.015s 5.36% Testcode was begin I := $FFFFFFF; while I > 0 do begin Val2 := MySwapEndian(Val); Dec(I); end; end. Currently only the ARM implementation is tested. ARMv6+ includes a rev instruction; while I've implemented the rev-based variants, I was not able to test them. git-svn-id: trunk@20685 -
2025-04-20 18:09:27 +02:00 · 2012-04-01 17:31:49 +00:00 · 2012-04-01 17:31:49 +00:00 · df0201799e
commit df0201799e
parent e0ae28b967
1 changed files with 121 additions and 0 deletions
--- a/rtl/arm/arm.inc
+++ b/rtl/arm/arm.inc
@ -855,6 +855,127 @@ begin
 {$endif FPC_SYSTEM_FPC_MOVE}
 end;

+{$define FPC_SYSTEM_HAS_SWAPENDIAN}
+
+{ SwapEndian(<16 Bit>) being inlined is faster than using assembler }
+{ Byte-swaps a signed 16-bit value. Written in plain Pascal (not asm) }
+{ so that the routine can be inlined.                                 }
+function SwapEndian(const AValue: SmallInt): SmallInt;{$ifdef SYSTEMINLINE}inline;{$endif}
+  var
+    Bits: Word;
+  begin
+    { Work on the unsigned 16-bit pattern: shifting the SmallInt itself  }
+    { right would first widen it to a LongInt, and for negative inputs   }
+    { copies of the sign bit would then be shifted in instead of zeroes. }
+    Bits := Word(AValue);
+    Result := SmallInt((Bits shl 8) or (Bits shr 8));
+  end;
+
+
+{ Byte-swaps an unsigned 16-bit value. }
+function SwapEndian(const AValue: Word): Word;{$ifdef SYSTEMINLINE}inline;{$endif}
+  begin
+    { the old low byte becomes the new high byte and vice versa }
+    Result := Word((Lo(AValue) shl 8) or Hi(AValue));
+  end;
+
+(*
+This is kept for reference. That's what the compiler COULD generate in these cases.
+But FPC currently does not support inlining of asm functions, so the call overhead
+is bigger than the gain of the optimized function.
+function AsmSwapEndian(const AValue: SmallInt): SmallInt;{$ifdef SYSTEMINLINE}inline;{$endif};assembler;nostackframe;
+asm
+	// We're starting with 4321
+{$if defined(cpuarmv3) or defined(cpuarmv4) or defined(cpuarmv5)}
+	mov r0, r0, shl #16	// Shift to make that 2100
+	mov r0, r0, ror #24	// Rotate to 1002
+	orr r0, r0, r0 shr #16  // Shift and combine into 0012
+{$else}
+	rev r0, r0		// Reverse byteorder    r0 = 1234
+	mov r0, r0, shr #16	// Shift down to 16bits r0 = 0012
+{$endif}
+end;
+
+*)
+
+{ Reverses the byte order of a signed 32-bit value; the value arrives }
+{ in r0 and the swapped result is left in r0.                         }
+function SwapEndian(const AValue: LongInt): LongInt;assembler;nostackframe;
+asm
+{$if defined(cpuarmv3) or defined(cpuarmv4) or defined(cpuarmv5)}
+        // No rev on this path: build the result in r2 one byte at a time.
+        // Input bytes are written as r0 = 4321; r1 and r2 are scratch.
+        and r1, r0, #16711680           // r1 = 0300  (mask $00FF0000)
+        mov r2, r0, lsr #24             // r2 = 0004
+        orr r2, r2, r1, lsr #8          // r2 = 0034
+        and r1, r0, #65280              // r1 = 0020  (mask $0000FF00)
+        orr r2, r2, r0, lsl #24         // r2 = 1034
+        orr r0, r2, r1, lsl #8          // r0 = 1234
+{$else}
+        // Other targets: one byte-reverse instruction does the whole job.
+        rev r0, r0
+{$endif}
+end;
+
+{ Reverses the byte order of an unsigned 32-bit value; the value arrives }
+{ in r0 and the swapped result is left in r0.                            }
+function SwapEndian(const AValue: DWord): DWord;assembler;nostackframe;
+asm
+{$if defined(cpuarmv3) or defined(cpuarmv4) or defined(cpuarmv5)}
+        // No rev on this path: build the result in r2 one byte at a time.
+        // Input bytes are written as r0 = 4321; r1 and r2 are scratch.
+        and r1, r0, #16711680           // r1 = 0300  (mask $00FF0000)
+        mov r2, r0, lsr #24             // r2 = 0004
+        orr r2, r2, r1, lsr #8          // r2 = 0034
+        and r1, r0, #65280              // r1 = 0020  (mask $0000FF00)
+        orr r2, r2, r0, lsl #24         // r2 = 1034
+        orr r0, r2, r1, lsl #8          // r0 = 1234
+{$else}
+        // Other targets: one byte-reverse instruction does the whole job.
+        rev r0, r0
+{$endif}
+end;
+
+{ Reverses the byte order of a signed 64-bit value.                      }
+{ On entry r0 and r1 hold the two 32-bit halves of AValue. On exit r0    }
+{ holds the byte-reversed former r1 and r1 the byte-reversed former r0.  }
+{ Scratch registers: r2, r3 and ip on the pre-ARMv6 path, r2 on the rev  }
+{ path.                                                                  }
+function SwapEndian(const AValue: Int64): Int64; assembler; nostackframe;
+asm
+        // We're starting with r0 = 4321 r1 = 8765
+{$if defined(cpuarmv3) or defined(cpuarmv4) or defined(cpuarmv5)}
+        mov ip, r1                      // keep 8765; r1 is overwritten below
+
+        // Byte-reverse the former r0 into r1.
+        mov r2, r0, lsr #24             // r2 = 0004
+        and r3, r0, #16711680           // r3 = 0300
+        orr r2, r2, r0, lsl #24         // r2 = 1004
+        orr r2, r2, r3, lsr #8          // r2 = 1034
+        and r0, r0, #65280              // r0 = 0020
+        orr r1, r2, r0, lsl #8          // r1 = 1234
+
+        // Byte-reverse the former r1 (saved in ip) into r0.
+        mov r2, ip, lsr #24             // r2 = 0008
+        and r3, ip, #16711680           // r3 = 0700
+        orr r2, r2, ip, lsl #24         // r2 = 5008
+        orr r2, r2, r3, lsr #8          // r2 = 5078
+        and ip, ip, #65280              // ip = 0060
+        orr r0, r2, ip, lsl #8          // r0 = 5678
+        // No explicit return: like the rev path below, this path falls
+        // through to the return sequence supplied for the routine. A
+        // hard-coded "bx lr" is also not encodable on ARMv3/ARMv4 (bx
+        // starts with ARMv4T), although this path is built for them.
+{$else}
+        rev r2, r0                      // r2 = 1234
+        rev r0, r1                      // r0 = 5678
+        mov r1, r2                      // r1 = 1234
+{$endif}
+end;
+
+{ Reverses the byte order of an unsigned 64-bit value.                   }
+{ On entry r0 and r1 hold the two 32-bit halves of AValue. On exit r0    }
+{ holds the byte-reversed former r1 and r1 the byte-reversed former r0.  }
+{ Scratch registers: r2, r3 and ip on the pre-ARMv6 path, r2 on the rev  }
+{ path.                                                                  }
+function SwapEndian(const AValue: QWord): QWord; assembler; nostackframe;
+asm
+        // We're starting with r0 = 4321 r1 = 8765
+{$if defined(cpuarmv3) or defined(cpuarmv4) or defined(cpuarmv5)}
+        mov ip, r1                      // keep 8765; r1 is overwritten below
+
+        // Byte-reverse the former r0 into r1.
+        mov r2, r0, lsr #24             // r2 = 0004
+        and r3, r0, #16711680           // r3 = 0300
+        orr r2, r2, r0, lsl #24         // r2 = 1004
+        orr r2, r2, r3, lsr #8          // r2 = 1034
+        and r0, r0, #65280              // r0 = 0020
+        orr r1, r2, r0, lsl #8          // r1 = 1234
+
+        // Byte-reverse the former r1 (saved in ip) into r0.
+        mov r2, ip, lsr #24             // r2 = 0008
+        and r3, ip, #16711680           // r3 = 0700
+        orr r2, r2, ip, lsl #24         // r2 = 5008
+        orr r2, r2, r3, lsr #8          // r2 = 5078
+        and ip, ip, #65280              // ip = 0060
+        orr r0, r2, ip, lsl #8          // r0 = 5678
+        // No explicit return: like the rev path below, this path falls
+        // through to the return sequence supplied for the routine. A
+        // hard-coded "bx lr" is also not encodable on ARMv3/ARMv4 (bx
+        // starts with ARMv4T), although this path is built for them.
+{$else}
+        rev r2, r0                      // r2 = 1234
+        rev r0, r1                      // r0 = 5678
+        mov r1, r2                      // r1 = 1234
+{$endif}
+end;
+
 {include hand-optimized assembler division code}
 {$i divide.inc}