+ added 32-bit and 64-bit unsigned asm optimized multiplication routines for

i8086, contributed by Max Nazhalov

git-svn-id: trunk@26306 -
This commit is contained in:
nickysn 2013-12-28 22:43:45 +00:00
parent 880201e56c
commit f2e73b5e6f
3 changed files with 262 additions and 1 deletions

1
.gitattributes vendored
View File

@ -7945,6 +7945,7 @@ rtl/i386/strings.inc svneol=native#text/plain
rtl/i386/stringss.inc svneol=native#text/plain
rtl/i386/strpas.inc svneol=native#text/plain
rtl/i8086/i8086.inc svneol=native#text/plain
rtl/i8086/int32p.inc svneol=native#text/plain
rtl/i8086/int64p.inc svneol=native#text/plain
rtl/i8086/makefile.cpu svneol=native#text/plain
rtl/i8086/math.inc svneol=native#text/plain

78
rtl/i8086/int32p.inc Normal file
View File

@ -0,0 +1,78 @@
{
This file is part of the Free Pascal run time library.
Copyright (c) 2013 by the Free Pascal development team
This file contains some helper routines for longint and dword
See the file COPYING.FPC, included in this distribution,
for details about the copyright.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
**********************************************************************}
{$define FPC_SYSTEM_HAS_MUL_DWORD}
function fpc_mul_dword( f1, f2: dword; checkoverflow: boolean ): dword; [public,alias: 'FPC_MUL_DWORD']; compilerproc;
begin
{ routine contributed by Max Nazhalov

Unsigned 32-bit multiplication built from 16-bit MUL instructions.
f1 = A1:A0 and f2 = B1:B0, where A0/B0 are the low words (offset 0) and
A1/B1 the high words (offset 2) of the operands.

checkoverflow = false: the product is truncated to its low 32 bits.
checkoverflow = true : every partial product that would reach bit 32 or above
is tested; on the first such overflow the asm block jumps to its exit with
checkoverflow still set, and the Pascal statement after the asm block then
calls HandleErrorAddrFrameInd with code 215 (presumably the arithmetic-overflow
run-time error -- confirm against the RTL error table). On success the asm
block clears checkoverflow itself.

//////// 16-bit multiplications summary:
(A1:A0*B1:B0) = (A1*B1)<<32 + (A1*B0)<<16 + (A0*B1)<<16 + (A0*B0)
A1*B1 [only needed for overflow checking; overflow if <>0]
A1*B0 [only lo_word is added to the result; checked path: overflow if hi_word<>0]
A0*B1 [only lo_word is added to the result; checked path: overflow if hi_word<>0]
A0*B0
NOTE(review): the terms below use words A2, A3, B2 and B3, which a dword
operand does not have and which the code of this routine never reads; they
look like they belong to a 64-bit multiplication summary -- confirm.
A3*B0 [only lo_word is needed; overflow if hi_word<>0]
A2*B1 [only lo_word is needed; overflow if hi_word<>0]
A2*B0
A1*B2 [only lo_word is needed; overflow if hi_word<>0]
A0*B3 [only lo_word is needed; overflow if hi_word<>0]
A0*B2
}
asm
mov cx,word[f1]  // cx = A0
mov ax,word[f1+2]  // ax = A1
mov di,word[f2]  // di = B0
mov si,word[f2+2]  // si = B1
cmp checkoverflow,0
jne @@checked  // caller asked for overflow checking
// ---- unchecked path: low 32 bits of A0*B0 + ((A1*B0 + A0*B1) shl 16)
mul di  // dx:ax = A1*B0
xchg ax,si  // si = lo(A1*B0), ax = B1
mul cx  // dx:ax = B1*A0
add si,ax  // si = lo(A1*B0) + lo(A0*B1); the carry lies above bit 31 and is dropped
mov ax,di
mul cx  // dx:ax = A0*B0; ax is already the low result word
add dx,si  // dx = high result word
jmp @@done
@@checked:
// ---- checked path: every jump to @@done below leaves checkoverflow set
test ax,ax
jz @@skip  // A1 = 0, so A1*B1 and A1*B0 are both zero
test si,si
jnz @@done  // A1<>0 and B1<>0: A1*B1<>0, overflow
mul di  // dx:ax = A1*B0 (B1 is known to be 0 here)
test dx,dx
jnz @@done  // hi word of A1*B0 would land above bit 31
@@skip:
xchg ax,si  // si = lo(A1*B0), or 0 when A1 = 0; ax = B1
mul cx  // dx:ax = B1*A0
test dx,dx
jnz @@done  // hi word of A0*B1 would land above bit 31
add si,ax
jc @@done  // the two middle terms together exceed 16 bits
mov ax,di
mul cx  // dx:ax = A0*B0
add dx,si
jc @@done  // high result word overflowed
// checked and succeed
mov checkoverflow,0  // clear the flag so the test after the asm block does not raise
@@done:
// On the overflow exits ax/dx still hold intermediate values, so the value
// stored here is only meaningful when no overflow was detected.
mov word[result],ax
mov word[result+2],dx
end [ 'ax','cx','dx','si','di' ];  // registers modified by the asm block
// Still true only when checking was requested and an overflow was detected.
if checkoverflow then
HandleErrorAddrFrameInd(215,get_pc_addr,get_frame);
end;

View File

@ -1,6 +1,6 @@
{
This file is part of the Free Pascal run time library.
Copyright (c) 1999-2000 by the Free Pascal development team
Copyright (c) 2013 by the Free Pascal development team
This file contains some helper routines for int64 and qword
@ -12,3 +12,185 @@
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
**********************************************************************}
{$I int32p.inc}
{$define FPC_SYSTEM_HAS_MUL_QWORD}
function fpc_mul_qword( f1, f2: qword; checkoverflow: longbool ): qword; [public,alias: 'FPC_MUL_QWORD']; compilerproc;
begin
{ routine contributed by Max Nazhalov

Unsigned 64-bit multiplication built from 16-bit MUL instructions.
The operands are read as four 16-bit words each, A0/B0 at offset 0 up to
A3/B3 at offset 6. checkoverflow is a longbool and is tested and cleared as
two 16-bit words. When checking is requested and an overflow is detected, the
asm block exits with checkoverflow still non-zero and the Pascal statement
after it calls HandleErrorAddrFrameInd with code 215 (presumably the
arithmetic-overflow run-time error -- confirm against the RTL error table).

64-bit multiplication via 16-bit digits: (A3:A2:A1:A0)*(B3:B2:B1:B0)
//////// STEP 1; break-down to 32-bit multiplications, each of them generates 64-bit result:
(A3:A2*B3:B2)<<64 + (A3:A2*B1:B0)<<32 + (A1:A0*B3:B2)<<32 + (A1:A0*B1:B0)
(A1:A0*B1:B0) = (A1*B1)<<32 + (A1*B0)<<16 + (A0*B1)<<16 + (A0*B0)
-- never overflows, forms the base of the final result, name it as "R64"
(A3:A2*B3:B2) is not required for the 64-bit result if overflow is not checked, since it is completely beyond the resulting width.
-- always overflows if "<>0", so can be checked as "((A2|A3)<>0)&&((B2|B3)<>0)"
(A3:A2*B1:B0) and (A1:A0*B3:B2) are partially required for the final result
-- to be calculated on steps 2 and 3 as a correction for the "R64"
//////// STEP 2; calculate "R64+=(A3:A2*B1:B0)<<32" (16-bit multiplications, each of them generates 32-bit result):
(A3*B1)<<32 + (A3*B0)<<16 + (A2*B1)<<16 + (A2*B0)
((A3*B1)<<32)<<32 is not required for the 64-bit result if overflow is not checked, since it is completely beyond the resulting width.
-- always overflows if "<>0", so can be checked as "(A3<>0)&&(B1<>0)"
((A3*B0)<<16)<<32: only low word of "A3*B0" contributes to the final result if overflow is not checked.
-- overflows if the hi_word "<>0"
-- overflows if R64+(lo_word<<48) produces C-flag
((A2*B1)<<16)<<32: only low word of "A2*B1" contributes to the final result if overflow is not checked.
-- overflows if the hi_word "<>0"
-- overflows if R64+(lo_word<<48) produces C-flag
(A2*B0)<<32: the whole dword is significant, name it as "X"
-- overflows if R64+(X<<32) produces C-flag
//////// STEP 3; calculate "R64+=(A1:A0*B3:B2)<<32" (16-bit multiplications, each of them generates 32-bit result):
(A1*B3)<<32 + (A1*B2)<<16 + (A0*B3)<<16 + (A0*B2)
((A1*B3)<<32)<<32 is not required for the 64-bit result if overflow is not checked, since it is completely beyond the resulting width.
-- always overflows if "<>0", so can be checked as "(A1<>0)&&(B3<>0)"
((A1*B2)<<16)<<32: only low word of "A1*B2" contributes to the final result if overflow is not checked.
-- overflows if the hi_word "<>0"
-- overflows if R64+(lo_word<<48) produces C-flag
((A0*B3)<<16)<<32: only low word "A0*B3" contributes to the final result if overflow is not checked.
-- overflows if the hi_word "<>0"
-- overflows if R64+(lo_word<<48) produces C-flag
(A0*B2)<<32: the whole dword is significant, name it as "Y"
-- overflows if R64+(Y<<32) produces C-flag
}
asm
// ---- STEP 1: R64 = A1:A0 * B1:B0, accumulated in bp:bx:cx:ax
mov di,word[f1]  // di = A0
mov bx,word[f1+2]  // bx = A1
mov si,word[f2]  // si = B0
mov ax,word[f2+2]  // ax = B1
push bp  // bp is borrowed as the top accumulator word; no parameter or result is referenced until the pop below
mov cx,ax  // cx = B1
mul bx  // dx:ax = B1*A1
xchg ax,bx  // bx = lo(A1*B1), ax = A1
mov bp,dx  // bp = hi(A1*B1)
mul si  // dx:ax = A1*B0
xchg ax,cx  // cx = lo(A1*B0), ax = B1
add bx,dx  // bx += hi(A1*B0)
adc bp,0
mul di  // dx:ax = B1*A0
add cx,ax  // cx += lo(A0*B1)
adc bx,dx  // bx += hi(A0*B1) + carry
adc bp,0
mov ax,di
mul si  // dx:ax = A0*B0; ax is the final lowest word
add cx,dx  // cx += hi(A0*B0)
adc bx,0
adc bp,0
mov dx,bp  // move the top word out of bp ...
pop bp  // ... and restore the caller-visible bp
mov word[result],ax
mov word[result+2],cx
mov word[result+4],bx
mov word[result+6],dx
// ---- STEPS 2 and 3 add a correction to the upper two words of the result
mov si,word[f1+4]  // si = A2
mov ax,word[f1+6]  // ax = A3
mov bx,word[checkoverflow]
or bx,word[checkoverflow+2]  // longbool: any set bit in either word counts as true
jnz @@checked
// ---- unchecked path: correction kept in cx:bx (cx = word 3, bx = word 2)
mov di,word[f2]  // di = B0
mul di  // dx:ax = A3*B0
mov cx,ax  // cx = lo(A3*B0)
mov ax,word[f2+2]  // ax = B1
mul si  // dx:ax = A2*B1
add cx,ax  // cx += lo(A2*B1)
mov ax,di
mul si  // dx:ax = A2*B0 ("X")
mov bx,ax  // bx = lo(X)
add cx,dx  // cx += hi(X)
mov si,word[f2+4]  // si = B2
mov ax,word[f2+6]  // ax = B3
mov di,word[f1]  // di = A0
mul di  // dx:ax = B3*A0
add cx,ax  // cx += lo(A0*B3)
mov ax,word[f1+2]  // ax = A1
mul si  // dx:ax = A1*B2
add cx,ax  // cx += lo(A1*B2)
mov ax,di
mul si  // dx:ax = A0*B2 ("Y")
add bx,ax  // bx += lo(Y)
adc cx,dx  // cx += hi(Y) + carry
add word[result+4],bx
adc word[result+6],cx
jmp @@done
@@checked:
// ---- checked path: every jump to @@done below leaves checkoverflow non-zero
mov bx,word[f2+6]  // bx = B3
mov cx,ax
or cx,si  // cx = A3|A2
jz @@nover1  // upper half of f1 is zero
mov cx,word[f2+4]
or cx,bx  // cx = B2|B3
jnz @@done  // both upper halves non-zero: (A3:A2*B3:B2)<>0, overflow
@@nover1:
test bx,bx
jz @@nover2  // B3 = 0
mov bx,word[f1+2]  // bx = A1
test bx,bx
jnz @@done  // A1<>0 and B3<>0: overflow
@@nover2:
// bx is 0 on every path that reaches this label
test ax,ax
jz @@nover3  // A3 = 0
or bx,word[f2+2]  // bx = B1 (bx was 0); sets ZF for the test below
jnz @@done  // A3<>0 and B1<>0: overflow
@@nover3:
mov di,word[f2]  // di = B0
mul di  // dx:ax = A3*B0
test dx,dx
jnz @@done  // hi word of A3*B0 non-zero: overflow
mov cx,ax  // cx = lo(A3*B0)
mov ax,word[f2+2]  // ax = B1
mul si  // dx:ax = A2*B1
test dx,dx
jnz @@done  // hi word of A2*B1 non-zero: overflow
add cx,ax
jc @@done
mov ax,di
mul si  // dx:ax = A2*B0 ("X")
mov bx,ax  // bx = lo(X)
add cx,dx
jc @@done
mov si,word[f2+4]  // si = B2
mov ax,word[f2+6]  // ax = B3
mov di,word[f1]  // di = A0
mul di  // dx:ax = B3*A0
test dx,dx
jnz @@done  // hi word of A0*B3 non-zero: overflow
add cx,ax
jc @@done
mov ax,word[f1+2]  // ax = A1
mul si  // dx:ax = A1*B2
test dx,dx
jnz @@done  // hi word of A1*B2 non-zero: overflow
add cx,ax
jc @@done
mov ax,di
mul si  // dx:ax = A0*B2 ("Y")
add bx,ax
adc cx,dx
jc @@done
add word[result+4],bx
adc word[result+6],cx
jc @@done  // adding the correction to R64 carried out of bit 63
// checked and succeed
xor ax,ax
mov word[checkoverflow],ax  // clear both words of the longbool so the test
mov word[checkoverflow+2],ax  // after the asm block does not raise
@@done:
// On the overflow exits result holds R64 from step 1 without the full
// correction, so it is only meaningful when no overflow was detected.
end [ 'ax','bx','cx','dx','si','di' ];  // bp is not listed: it is saved and restored inside the block
// Still true only when checking was requested and an overflow was detected.
if checkoverflow then
HandleErrorAddrFrameInd(215,get_pc_addr,get_frame);
end;