new division helpers for ARM by Nico Erfurth. on our ARMv5 core hardware they're 22%-36% faster than the generic ones for the most common case.

git-svn-id: trunk@28273 -
This commit is contained in:
Károly Balogh 2014-07-29 17:39:55 +00:00
parent 968ddb6ad9
commit 1a4d6d79c5
3 changed files with 193 additions and 2 deletions

1
.gitattributes vendored
View File

@ -7832,6 +7832,7 @@ rtl/android/mipsel/dllprt0.as svneol=native#text/plain
rtl/android/mipsel/prt0.as svneol=native#text/plain
rtl/arm/arm.inc svneol=native#text/plain
rtl/arm/armdefines.inc svneol=native#text/plain
rtl/arm/divide.inc svneol=native#text/plain
rtl/arm/int64p.inc svneol=native#text/plain
rtl/arm/makefile.cpu svneol=native#text/plain
rtl/arm/math.inc svneol=native#text/plain

View File

@ -1115,5 +1115,4 @@ end;
{$endif}
{include hand-optimized assembler division code}
{ $i divide.inc}
{$i divide.inc}

191
rtl/arm/divide.inc Normal file
View File

@ -0,0 +1,191 @@
{
This file is part of the Free Pascal run time library.
Copyright (c) 2014 by the Free Pascal development team.
Implementation of division helpers
See the file COPYING.FPC, included in this distribution,
for details about the copyright.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
**********************************************************************}
{$if defined(CPUARM_HAS_UMULL) and defined(CPUARM_HAS_CLZ)}
{ ARM division helpers using umull to do a 32-bit division based on
this paper: http://research.microsoft.com/pubs/70645/tr-2008-141.pdf
For future optimization and testing, this file is compilable outside
the system unit. }
{$ifndef FPC_SYSTEM_HAS_DIV_DWORD}
{$define FPC_SYSTEM_HAS_DIV_DWORD}
function fpc_div_dword(n,z:dword):dword;assembler;nostackframe;
{$ifdef FPC_IS_SYSTEM}[public,alias: 'FPC_DIV_DWORD'];{$endif}
asm
// Handle division by zero
cmp r0, #0
beq .Lhandle_div_by_zero
stmfd r13!, {r4,lr}
// r1 = divisor
// r0 = dividend
// r2 = k, z
// r3 = ty, t, my
// ip = temp
// r4 = scratch
// unsigned k = clz(y);
clz r2, r0
// unsigned ty = lsr( lsl(y,k), W-9 ); // prescaling
// r3/ty will ALWAYS give a result between 256 and 511
mov r3, r0, lsl r2
mov r3, r3, lsr #23
// unsigned t = unrt[ ty - 256 ] + 256; // table lookup
adr r4, .LLeading9BitTable - 256
ldrb ip, [r4, r3]
rsb r3, r2, #31
and r3, r3, #255
add ip, ip, #256
// unsigned z = lsr( lsl(t,W-9), W-k-1 );
mov ip, ip, lsl #23
mov r2, ip, lsr r3
// unsigned my = 0-y;
rsb r3, r0, #0
// z = z + umulh(z,mul(my,z));
mul ip, r3, r2
umull r4, ip, r2, ip
add r2, r2, ip
// z = z + umulh(z,mul(my,z));
mul ip, r3, r2
umull r4, ip, r2, ip
add r2, r2, ip
// q estimate
// q = umulh(x,z);
// ip = q
umull r4, ip, r1, r2
// r = x - mul(y,q);
// r4 = r
mul r4, r0, ip
sub r4, r1, r4
// q refinement
// if (r >= y) { r = r - y; q = q + 1; }
cmp r0,r4
subls r4, r4, r0
addls ip, ip, #1
// if (r >= y) { r = r - y; q = q + 1; }
cmp r0,r4
subls r4, r4, r0
addls ip, ip, #1
mov r0, ip
mov r1, r4
ldmfd r13!, {r4,pc}
.LLeading9BitTable:
.byte 254, 252, 250, 248, 246, 244, 242, 240, 238, 236, 234, 233, 231, 229, 227, 225
.byte 224, 222, 220, 218, 217, 215, 213, 212, 210, 208, 207, 205, 203, 202, 200, 199
.byte 197, 195, 194, 192, 191, 189, 188, 186, 185, 183, 182, 180, 179, 178, 176, 175
.byte 173, 172, 170, 169, 168, 166, 165, 164, 162, 161, 160, 158, 157, 156, 154, 153
.byte 152, 151, 149, 148, 147, 146, 144, 143, 142, 141, 139, 138, 137, 136, 135, 134
.byte 132, 131, 130, 129, 128, 127, 126, 125, 123, 122, 121, 120, 119, 118, 117, 116
.byte 115, 114, 113, 112, 111, 110, 109, 108, 107, 106, 105, 104, 103, 102, 101, 100
.byte 99, 98, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 88, 87, 86, 85
.byte 84, 83, 82, 81, 80, 80, 79, 78, 77, 76, 75, 74, 74, 73, 72, 71
.byte 70, 70, 69, 68, 67, 66, 66, 65, 64, 63, 62, 62, 61, 60, 59, 59
.byte 58, 57, 56, 56, 55, 54, 53, 53, 52, 51, 50, 50, 49, 48, 48, 47
.byte 46, 46, 45, 44, 43, 43, 42, 41, 41, 40, 39, 39, 38, 37, 37, 36
.byte 35, 35, 34, 33, 33, 32, 32, 31, 30, 30, 29, 28, 28, 27, 27, 26
.byte 25, 25, 24, 24, 23, 22, 22, 21, 21, 20, 19, 19, 18, 18, 17, 17
.byte 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 10, 10, 9, 9, 8, 8
.byte 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0
.Lhandle_div_by_zero:
mov r0, #200
mov r1, r11
{$ifdef FPC_IS_SYSTEM}
b handleerrorframe
{$endif}
end;
{It is a compilerproc (systemh.inc), make an alias for internal use.}
{$ifdef FPC_IS_SYSTEM}
function fpc_div_dword(n,z:dword):dword;external name 'FPC_DIV_DWORD';
{$endif}
{$endif}
{$ifndef FPC_SYSTEM_HAS_DIV_LONGINT}
{$define FPC_SYSTEM_HAS_DIV_LONGINT}
function fpc_div_longint(n,z:longint):longint;assembler;nostackframe;
{$ifdef FPC_IS_SYSTEM}[public,alias: 'FPC_DIV_LONGINT'];{$endif}
asm
stmfd sp!, {r4,lr}
ands r4, r0, #1<<31 (* r12:=r0 and $80000000 *)
rsbmi r0, r0, #0 (* if signed(r0) then r0:=0-r0 *)
eors r4, r4, r1, ASR#32 (* r12:=r12 xor (r1 asr 32) *)
rsbcs r1, r1, #0 (* if signed(r12) then r1:=0-r1 *)
bl fpc_div_dword
movs r4, r4, LSL#1 (* carry:=sign(r12) *)
rsbcs r0, r0, #0
rsbmi r1, r1, #0
ldmfd sp!, {r4,pc}
end;
{It is a compilerproc (systemh.inc), make an alias for internal use.}
{$ifdef FPC_IS_SYSTEM}
function fpc_div_longint(n,z:longint):longint;external name 'FPC_DIV_LONGINT';
{$endif}
{$endif}
{$ifndef FPC_SYSTEM_HAS_MOD_DWORD}
{$define FPC_SYSTEM_HAS_MOD_DWORD}
function fpc_mod_dword(n,z:dword):dword;assembler;nostackframe;
{$ifdef FPC_IS_SYSTEM}[public,alias: 'FPC_MOD_DWORD'];{$endif}
asm
stmfd sp!, {ip,lr}
bl fpc_div_dword
mov r0, r1
ldmfd sp!, {ip,pc}
end;
{It is a compilerproc (systemh.inc), make an alias for internal use.}
{$ifdef FPC_IS_SYSTEM}
function fpc_mod_dword(n,z:dword):dword;external name 'FPC_MOD_DWORD';
{$endif}
{$endif}
{$ifndef FPC_SYSTEM_HAS_MOD_LONGINT}
{$define FPC_SYSTEM_HAS_MOD_LONGINT}
function fpc_mod_longint(n,z:longint):longint;assembler;nostackframe;
{$ifdef FPC_IS_SYSTEM}[public,alias: 'FPC_MOD_LONGINT'];{$endif}
asm
stmfd sp!, {ip,lr}
bl fpc_div_longint
mov r0, r1
ldmfd sp!, {ip,pc}
end;
{It is a compilerproc (systemh.inc), make an alias for internal use.}
{$ifdef FPC_IS_SYSTEM}
function fpc_mod_longint(n,z:longint):longint;external name 'FPC_MOD_LONGINT';
{$endif}
{$endif}
{$endif}