- Adds a number of optimizations for 64bit integer operations on AVR. Patch from Christo Crause in issue #35691.

git-svn-id: trunk@42495 -
This commit is contained in:
Jeppe Johansen 2019-07-25 15:36:24 +00:00
parent 4b93eb64b3
commit 893507a5d6

View File

@ -12,3 +12,554 @@
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
**********************************************************************}
{$define FPC_SYSTEM_HAS_SHR_QWORD}
// Logical right shift of a 64-bit value.
// The shift count is reduced to 0..63. Whole bytes are moved first (8 bits
// per pass); whatever is left (1..7 bits) is then shifted one bit per pass.
// The bit loop is not shortened further, trading some speed for code size.
function fpc_shr_qword(value: qword; shift: ALUUInt): qword; assembler; nostackframe;
  [public, alias: 'FPC_SHR_QWORD']; compilerproc;
label
  movebytes, shiftbits, done;
asm
  // In:  value  R25 (most significant byte) ... R18 (least significant byte)
  //      shift  R16
  // Out: result R25 ... R18; R16 is restored before returning
  push R16
  andi R16, 63          // keep only the 6 low bits of the count (original note: as the generic routine does)
  breq done             // count is zero: value is returned unchanged
movebytes:
  cpi  R16, 8
  brlo shiftbits        // fewer than 8 bits remain: finish bit by bit
  mov  R18, R19         // move every byte one position towards the low end
  mov  R19, R20
  mov  R20, R21
  mov  R21, R22
  mov  R22, R23
  mov  R23, R24
  mov  R24, R25
  clr  R25              // the vacated most significant byte becomes zero
  subi R16, 8           // 8 bits done
  brne movebytes        // bits remain: check for another whole byte
  rjmp done             // count was a multiple of 8
shiftbits:
  lsr  R25              // one-bit shift through all 8 bytes, top byte first
  ror  R24
  ror  R23
  ror  R22
  ror  R21
  ror  R20
  ror  R19
  ror  R18
  dec  R16
  brne shiftbits        // repeat until the count reaches zero
done:
  pop  R16
end;

function fpc_shr_qword(value: qword; shift: ALUUInt): qword; external name 'FPC_SHR_QWORD';
{$define FPC_SYSTEM_HAS_SHL_QWORD}
// Left shift of a 64-bit value.
// The shift count is reduced to 0..63. Whole bytes are moved first (8 bits
// per pass); whatever is left (1..7 bits) is then shifted one bit per pass.
function fpc_shl_qword(value: qword; shift: ALUUInt): qword; assembler; nostackframe;
  [public, alias: 'FPC_SHL_QWORD']; compilerproc;
label
  movebytes, shiftbits, done;
asm
  // In:  value  R25 (most significant byte) ... R18 (least significant byte)
  //      shift  R16
  // Out: result R25 ... R18; R16 is restored before returning
  push R16
  andi R16, 63          // keep only the 6 low bits of the count (original note: as the generic routine does)
  breq done             // count is zero: value is returned unchanged
movebytes:
  cpi  R16, 8
  brlo shiftbits        // fewer than 8 bits remain: finish bit by bit
  mov  R25, R24         // move every byte one position towards the high end
  mov  R24, R23
  mov  R23, R22
  mov  R22, R21
  mov  R21, R20
  mov  R20, R19
  mov  R19, R18
  clr  R18              // the vacated least significant byte becomes zero
  subi R16, 8           // 8 bits done
  brne movebytes        // bits remain: check for another whole byte
  rjmp done             // count was a multiple of 8
shiftbits:
  lsl  R18              // one-bit shift through all 8 bytes, low byte first
  rol  R19
  rol  R20
  rol  R21
  rol  R22
  rol  R23
  rol  R24
  rol  R25
  dec  R16
  brne shiftbits        // repeat until the count reaches zero
done:
  pop  R16
end;

function fpc_shl_qword(value: qword; shift: ALUUInt): qword; external name 'FPC_SHL_QWORD';
{$define FPC_SYSTEM_HAS_SHL_INT64}
// Left shift of a signed 64-bit value: the bit pattern is reinterpreted as
// qword and handed to fpc_shl_qword.
function fpc_shl_int64(value: int64; shift: ALUUInt): int64;
  [public, alias: 'FPC_SHL_INT64']; compilerproc; inline;
begin
  fpc_shl_int64 := fpc_shl_qword(qword(value), shift);
end;
{$define FPC_SYSTEM_HAS_SHR_INT64}
// Right shift of a signed 64-bit value. This is a logical shift (no sign
// extension), so the bit pattern is simply passed to fpc_shr_qword.
function fpc_shr_int64(value: int64; shift: ALUUInt): int64; [public, alias: 'FPC_SHR_INT64']; compilerproc;
begin
  fpc_shr_int64 := fpc_shr_qword(qword(value), shift);
end;
{$define FPC_SYSTEM_HAS_DIV_QWORD}
// Unsigned 64-bit division: returns z div n.
// Bit-by-bit restoring division: 64 passes, each shifting one dividend bit
// into the remainder and trying to subtract the divisor from it.
// fpc_divbyzero is called when n = 0.
function fpc_div_qword(n,z : qword): qword; nostackframe; assembler; [public,alias: 'FPC_DIV_QWORD']; compilerproc;
label
  nextbit, restore, count;
asm
  // Register roles
  //   z (A)  dividend, replaced bit by bit by the quotient   R17 ... R10
  //   n (B)  divisor                                         R25 ... R18
  //   r (P)  remainder                                       R9  ... R2
  //   i      pass counter                                    R26
  //          constant 1                                      R27
  //   R1 is used as the zero operand of the comparison below.

  // Z stays set through the cp/cpc chain only if every byte of n is zero.
  cp   R25, R1
  cpc  R24, R1
  cpc  R23, R1
  cpc  R22, R1
  cpc  R21, R1
  cpc  R20, R1
  cpc  R19, R1
  cpc  R18, R1
  brne .LNonZero
{$ifdef CPUAVR_HAS_JMP_CALL}
  call fpc_divbyzero
{$else CPUAVR_HAS_JMP_CALL}
  rcall fpc_divbyzero
{$endif CPUAVR_HAS_JMP_CALL}

.LNonZero:
  // Save R17..R2; they are restored in reverse order before returning.
  push R17
  push R16
  push R15
  push R14
  push R13
  push R12
  push R11
  push R10
  push R9
  push R8
  push R7
  push R6
  push R5
  push R4
  push R3
  push R2

  ldi  R27, 1           // operand for the OR that sets a quotient bit

  clr  R9               // remainder := 0
  clr  R8
  clr  R7
  clr  R6
  clr  R5
  clr  R4
  clr  R3
  clr  R2

  ldi  R26, 64          // one pass per dividend bit

nextbit:
  lsl  R10              // A := A shl 1, top bit leaves through carry ...
  rol  R11
  rol  R12
  rol  R13
  rol  R14
  rol  R15
  rol  R16
  rol  R17
  rol  R2               // ... and enters P as its new lowest bit
  rol  R3
  rol  R4
  rol  R5
  rol  R6
  rol  R7
  rol  R8
  rol  R9

  sub  R2, R18          // trial subtraction: P := P - B
  sbc  R3, R19
  sbc  R4, R20
  sbc  R5, R21
  sbc  R6, R22
  sbc  R7, R23
  sbc  R8, R24
  sbc  R9, R25
  brlo restore          // borrow: B did not fit
  or   R10, R27         // B fitted: quotient bit A[0] := 1
  rjmp count

restore:                // quotient bit stays 0 (shifted in by lsl); undo the subtraction
  add  R2, R18
  adc  R3, R19
  adc  R4, R20
  adc  R5, R21
  adc  R6, R22
  adc  R7, R23
  adc  R8, R24
  adc  R9, R25

count:
  dec  R26
  brne nextbit          // loop until all 64 bits are processed

  mov  R25, R17         // quotient R17..R10 -> result registers R25..R18
  mov  R24, R16
  mov  R23, R15
  mov  R22, R14
  mov  R21, R13
  mov  R20, R12
  mov  R19, R11
  mov  R18, R10

  pop  R2
  pop  R3
  pop  R4
  pop  R5
  pop  R6
  pop  R7
  pop  R8
  pop  R9
  pop  R10
  pop  R11
  pop  R12
  pop  R13
  pop  R14
  pop  R15
  pop  R16
  pop  R17
end;

function fpc_div_qword(n,z : qword): qword; external name 'FPC_DIV_QWORD';
{$define FPC_SYSTEM_HAS_MOD_QWORD}
// Unsigned 64-bit modulo: returns z mod n.
// Runs the same bit-by-bit restoring division as the quotient routine, but
// returns the remainder register instead of the quotient.
// fpc_divbyzero is called when n = 0.
function fpc_mod_qword(n,z : qword): qword; nostackframe; assembler; [public,alias: 'FPC_MOD_QWORD']; compilerproc;
label
  nextbit, restore, count;
asm
  // Register roles
  //   z (A)  dividend, replaced bit by bit by the quotient   R17 ... R10
  //   n (B)  divisor                                         R25 ... R18
  //   r (P)  remainder                                       R9  ... R2
  //   i      pass counter                                    R26
  //          constant 1                                      R27
  //   R1 is used as the zero operand of the comparison below.

  // Z stays set through the cp/cpc chain only if every byte of n is zero.
  cp   R25, R1
  cpc  R24, R1
  cpc  R23, R1
  cpc  R22, R1
  cpc  R21, R1
  cpc  R20, R1
  cpc  R19, R1
  cpc  R18, R1
  brne .LNonZero
{$ifdef CPUAVR_HAS_JMP_CALL}
  call fpc_divbyzero
{$else CPUAVR_HAS_JMP_CALL}
  rcall fpc_divbyzero
{$endif CPUAVR_HAS_JMP_CALL}

.LNonZero:
  // Save R17..R2; they are restored in reverse order before returning.
  push R17
  push R16
  push R15
  push R14
  push R13
  push R12
  push R11
  push R10
  push R9
  push R8
  push R7
  push R6
  push R5
  push R4
  push R3
  push R2

  ldi  R27, 1           // operand for the OR that sets a quotient bit

  clr  R9               // remainder := 0
  clr  R8
  clr  R7
  clr  R6
  clr  R5
  clr  R4
  clr  R3
  clr  R2

  ldi  R26, 64          // one pass per dividend bit

nextbit:
  lsl  R10              // A := A shl 1, top bit leaves through carry ...
  rol  R11
  rol  R12
  rol  R13
  rol  R14
  rol  R15
  rol  R16
  rol  R17
  rol  R2               // ... and enters P as its new lowest bit
  rol  R3
  rol  R4
  rol  R5
  rol  R6
  rol  R7
  rol  R8
  rol  R9

  sub  R2, R18          // trial subtraction: P := P - B
  sbc  R3, R19
  sbc  R4, R20
  sbc  R5, R21
  sbc  R6, R22
  sbc  R7, R23
  sbc  R8, R24
  sbc  R9, R25
  brlo restore          // borrow: B did not fit
  or   R10, R27         // B fitted: quotient bit A[0] := 1
  rjmp count

restore:                // quotient bit stays 0 (shifted in by lsl); undo the subtraction
  add  R2, R18
  adc  R3, R19
  adc  R4, R20
  adc  R5, R21
  adc  R6, R22
  adc  R7, R23
  adc  R8, R24
  adc  R9, R25

count:
  dec  R26
  brne nextbit          // loop until all 64 bits are processed

  mov  R25, R9          // remainder R9..R2 -> result registers R25..R18
  mov  R24, R8
  mov  R23, R7
  mov  R22, R6
  mov  R21, R5
  mov  R20, R4
  mov  R19, R3
  mov  R18, R2

  pop  R2
  pop  R3
  pop  R4
  pop  R5
  pop  R6
  pop  R7
  pop  R8
  pop  R9
  pop  R10
  pop  R11
  pop  R12
  pop  R13
  pop  R14
  pop  R15
  pop  R16
  pop  R17
end;

function fpc_mod_qword(n,z : qword): qword; external name 'FPC_MOD_QWORD';
{$define FPC_SYSTEM_HAS_DIV_INT64}
// Signed 64-bit division: returns z div n.
// Both operands are converted to their absolute values, the unsigned
// routine fpc_div_qword does the division, and the quotient is negated
// afterwards when the operands had different signs.
function fpc_div_int64(n,z : int64) : int64; nostackframe; assembler; [public,alias: 'FPC_DIV_INT64']; compilerproc;
label
  pos1, pos2, fin;
asm
  // Register roles
  //   n  divisor     R25 ... R18 (also receives the result)
  //   z  dividend    R17 ... R10
  //   neg_result     R30  (bit 7 set => quotient must be negated)
  //   one            R31
  //   R1 is used as a zero operand.
  //
  // z may be negated in place below. R10..R17 are saved first and restored
  // on exit so the caller gets them back unchanged, as fpc_div_qword and
  // fpc_shr_qword in this file also do for the low registers they modify.
  push R17
  push R16
  push R15
  push R14
  push R13
  push R12
  push R11
  push R10

  mov  R30, R17         // hi8(z)
  eor  R30, R25         // hi8(z) XOR hi8(n): MSB set when the signs differ

  // n := abs(n)
  ldi  R31, 1           // constant 1 for the "+1" of the two's complement
  sub  R25, R1          // subtract 0, only to get the sign flag of hi8(n)
  brpl pos1
  com  R25              // invert all bits ...
  com  R24
  com  R23
  com  R22
  com  R21
  com  R20
  com  R19
  com  R18
  add  R18, R31         // ... and add 1
  adc  R19, R1          // propagate the carry
  adc  R20, R1
  adc  R21, R1
  adc  R22, R1
  adc  R23, R1
  adc  R24, R1
  adc  R25, R1

pos1:
  // z := abs(z)
  sub  R17, R1          // subtract 0, only to get the sign flag of hi8(z)
  brpl pos2
  com  R17
  com  R16
  com  R15
  com  R14
  com  R13
  com  R12
  com  R11
  com  R10
  add  R10, R31
  adc  R11, R1
  adc  R12, R1
  adc  R13, R1
  adc  R14, R1
  adc  R15, R1
  adc  R16, R1
  adc  R17, R1

pos2:
  // R30 has to survive this call; the division loop in fpc_div_qword
  // only works on R2..R27.
{$ifdef CPUAVR_HAS_JMP_CALL}
  call fpc_div_qword
{$else CPUAVR_HAS_JMP_CALL}
  rcall fpc_div_qword
{$endif CPUAVR_HAS_JMP_CALL}

  sbrs R30, 7           // sign bit set: skip the jump and negate the quotient
  rjmp fin              // sign bit clear: quotient is already correct
  com  R25              // quotient from FPC_DIV_QWORD is in R25 ... R18
  com  R24
  com  R23
  com  R22
  com  R21
  com  R20
  com  R19
  com  R18
  ldi  R31, 1
  add  R18, R31         // add 1
  adc  R19, R1          // propagate the carry
  adc  R20, R1
  adc  R21, R1
  adc  R22, R1
  adc  R23, R1
  adc  R24, R1
  adc  R25, R1

fin:
  pop  R10              // restore the caller's R10..R17
  pop  R11
  pop  R12
  pop  R13
  pop  R14
  pop  R15
  pop  R16
  pop  R17
end;
{$define FPC_SYSTEM_HAS_MOD_INT64}
// Signed 64-bit modulo: returns z mod n.
// Both operands are converted to their absolute values, the unsigned
// routine fpc_mod_qword computes the remainder, and the remainder is
// negated afterwards when the dividend z was negative (the sign of n does
// not influence the sign of the result).
function fpc_mod_int64(n,z : int64) : int64; nostackframe; assembler; [public,alias: 'FPC_MOD_INT64']; compilerproc;
label
  pos1, pos2, fin;
asm
  // Register roles
  //   n  divisor     R25 ... R18 (also receives the result)
  //   z  dividend    R17 ... R10
  //   neg_result     R30  (bit 7 set => remainder must be negated)
  //   one            R31
  //   R1 is used as a zero operand.
  //
  // z may be negated in place below. R10..R17 are saved first and restored
  // on exit so the caller gets them back unchanged, as fpc_mod_qword and
  // fpc_shr_qword in this file also do for the low registers they modify.
  push R17
  push R16
  push R15
  push R14
  push R13
  push R12
  push R11
  push R10

  mov  R30, R17         // hi8(z): its MSB is the sign of the result

  // n := abs(n)
  ldi  R31, 1           // constant 1 for the "+1" of the two's complement
  sub  R25, R1          // subtract 0, only to get the sign flag of hi8(n)
  brpl pos1
  com  R25              // invert all bits ...
  com  R24
  com  R23
  com  R22
  com  R21
  com  R20
  com  R19
  com  R18
  add  R18, R31         // ... and add 1
  adc  R19, R1          // propagate the carry
  adc  R20, R1
  adc  R21, R1
  adc  R22, R1
  adc  R23, R1
  adc  R24, R1
  adc  R25, R1

pos1:
  // z := abs(z)
  sub  R17, R1          // subtract 0, only to get the sign flag of hi8(z)
  brpl pos2
  com  R17
  com  R16
  com  R15
  com  R14
  com  R13
  com  R12
  com  R11
  com  R10
  add  R10, R31
  adc  R11, R1
  adc  R12, R1
  adc  R13, R1
  adc  R14, R1
  adc  R15, R1
  adc  R16, R1
  adc  R17, R1

pos2:
  // R30 has to survive this call; the division loop in fpc_mod_qword
  // only works on R2..R27.
{$ifdef CPUAVR_HAS_JMP_CALL}
  call fpc_mod_qword
{$else CPUAVR_HAS_JMP_CALL}
  rcall fpc_mod_qword
{$endif CPUAVR_HAS_JMP_CALL}

  sbrs R30, 7           // sign bit set: skip the jump and negate the remainder
  rjmp fin              // sign bit clear: remainder is already correct
  com  R25              // two's complement of the remainder in R25 ... R18:
  com  R24              // invert all bits ...
  com  R23
  com  R22
  com  R21
  com  R20
  com  R19
  com  R18
  ldi  R31, 1
  add  R18, R31         // ... and add 1
  adc  R19, R1
  adc  R20, R1
  adc  R21, R1
  adc  R22, R1
  adc  R23, R1
  adc  R24, R1
  adc  R25, R1

fin:
  pop  R10              // restore the caller's R10..R17
  pop  R11
  pop  R12
  pop  R13
  pop  R14
  pop  R15
  pop  R16
  pop  R17
end;