{ This file is part of the Free Pascal run time library. Copyright (c) 2013 by the Free Pascal development team This file contains some helper routines for int64 and qword See the file COPYING.FPC, included in this distribution, for details about the copyright. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. **********************************************************************} {$ifndef VER3_0} {$define FPC_SYSTEM_HAS_MUL_QWORD} function fpc_mul_qword( f1, f2: qword): qword; [public,alias: 'FPC_MUL_QWORD']; compilerproc; begin { routine contributed by Max Nazhalov 64-bit multiplication via 16-bit digits: (A3:A2:A1:A0)*(B3:B2:B1:B0) //////// STEP 1; break-down to 32-bit multiplications, each of them generates 64-bit result: (A3:A2*B3:B2)<<64 + (A3:A2*B1:B0)<<32 + (A1:A0*B3:B2)<<32 + (A1:A0*B1:B0) (A1:A0*B1:B0) = (A1*B1)<<32 + (A1*B0)<<16 + (A0*B1)<<16 + (A0:B0) -- never overflows, forms the base of the final result, name it as "R64" (A3:A2*B3:B2) is not required for the 64-bit result if overflow is not checked, since it is completely beyond the resulting width. -- always overflows if "<>0", so can be checked as "((A2|A3)<>0)&&(B2|B3)<>0)" (A3:A2*B1:B0) and (A1:A0*B3:B2) are partially required for the final result -- to be calculated on steps 2 and 3 as a correction for the "R64" //////// STEP 2; calculate "R64+=(A3:A2*B1:B0)<<32" (16-bit multiplications, each of them generates 32-bit result): (A3*B1)<<32 + (A3*B0)<<16 + (A2*B1)<<16 + (A2*B0) ((A3*B1)<<32)<<32 is not required for the 64-bit result if overflow is not checked, since it is completely beyond the resulting width. -- always overflows if "<>0", so can be checked as "(A3<>0)&&(B1<>0)" ((A3*B0)<<16)<<32: only low word of "A3*B0" contributes to the final result if overflow is not checked. -- overflows if the hi_word "<>0" -- overflows if R64+(lo_word<<48) produces C-flag ((A2*B1)<<16)<<32: only low word of "A2*B1" contributes to the final result if overflow is not checked. -- overflows if the hi_word "<>0" -- overflows if R64+(lo_word<<48) produces C-flag (A2*B0)<<32: the whole dword is significand, name it as "X" -- overflows if R64+(X<<32) produces C-flag //////// STEP 3; calculate "R64+=(A1:A0*B3:B2)<<32" (16-bit multiplications, each of them generates 32-bit result): (A1*B3)<<32 + (A1*B2)<<16 + (A0*B3)<<16 + (A0*B2) ((A1*B3)<<32)<<32 is not required for the 64-bit result if overflow is not checked, since it is completely beyond the resulting width. -- always overflows if "<>0", so can be checked as "(A1<>0)&&(B3<>0)" ((A1*B2)<<16)<<32: only low word of "A1*B2" contributes to the final result if overflow is not checked. -- overflows if the hi_word "<>0" -- overflows if R64+(lo_word<<48) produces C-flag ((A0*B3)<<16)<<32: only low word "A0*B3" contributes to the final result if overflow is not checked. -- overflows if the hi_word "<>0" -- overflows if R64+(lo_word<<48) produces C-flag (A0*B2)<<32: the whole dword is significand, name it as "Y" -- overflows if R64+(Y<<32) produces C-flag } asm mov di,word[f1] mov bx,word[f1+2] mov si,word[f2] mov ax,word[f2+2] push bp mov cx,ax mul bx xchg ax,bx mov bp,dx mul si xchg ax,cx add bx,dx adc bp,0 mul di add cx,ax adc bx,dx adc bp,0 mov ax,di mul si add cx,dx adc bx,0 adc bp,0 mov dx,bp pop bp mov word[result],ax mov word[result+2],cx mov word[result+4],bx mov word[result+6],dx mov si,word[f1+4] mov ax,word[f1+6] mov di,word[f2] mul di mov cx,ax mov ax,word[f2+2] mul si add cx,ax mov ax,di mul si mov bx,ax add cx,dx mov si,word[f2+4] mov ax,word[f2+6] mov di,word[f1] mul di add cx,ax mov ax,word[f1+2] mul si add cx,ax mov ax,di mul si add bx,ax adc cx,dx add word[result+4],bx adc word[result+6],cx end [ 'ax','bx','cx','dx','si','di' ]; end; function fpc_mul_qword_checkoverflow( f1, f2: qword): qword; [public,alias: 'FPC_MUL_QWORD_CHECKOVERFLOW']; compilerproc; begin { routine contributed by Max Nazhalov 64-bit multiplication via 16-bit digits: (A3:A2:A1:A0)*(B3:B2:B1:B0) //////// STEP 1; break-down to 32-bit multiplications, each of them generates 64-bit result: (A3:A2*B3:B2)<<64 + (A3:A2*B1:B0)<<32 + (A1:A0*B3:B2)<<32 + (A1:A0*B1:B0) (A1:A0*B1:B0) = (A1*B1)<<32 + (A1*B0)<<16 + (A0*B1)<<16 + (A0:B0) -- never overflows, forms the base of the final result, name it as "R64" (A3:A2*B3:B2) is not required for the 64-bit result if overflow is not checked, since it is completely beyond the resulting width. -- always overflows if "<>0", so can be checked as "((A2|A3)<>0)&&(B2|B3)<>0)" (A3:A2*B1:B0) and (A1:A0*B3:B2) are partially required for the final result -- to be calculated on steps 2 and 3 as a correction for the "R64" //////// STEP 2; calculate "R64+=(A3:A2*B1:B0)<<32" (16-bit multiplications, each of them generates 32-bit result): (A3*B1)<<32 + (A3*B0)<<16 + (A2*B1)<<16 + (A2*B0) ((A3*B1)<<32)<<32 is not required for the 64-bit result if overflow is not checked, since it is completely beyond the resulting width. -- always overflows if "<>0", so can be checked as "(A3<>0)&&(B1<>0)" ((A3*B0)<<16)<<32: only low word of "A3*B0" contributes to the final result if overflow is not checked. -- overflows if the hi_word "<>0" -- overflows if R64+(lo_word<<48) produces C-flag ((A2*B1)<<16)<<32: only low word of "A2*B1" contributes to the final result if overflow is not checked. -- overflows if the hi_word "<>0" -- overflows if R64+(lo_word<<48) produces C-flag (A2*B0)<<32: the whole dword is significand, name it as "X" -- overflows if R64+(X<<32) produces C-flag //////// STEP 3; calculate "R64+=(A1:A0*B3:B2)<<32" (16-bit multiplications, each of them generates 32-bit result): (A1*B3)<<32 + (A1*B2)<<16 + (A0*B3)<<16 + (A0*B2) ((A1*B3)<<32)<<32 is not required for the 64-bit result if overflow is not checked, since it is completely beyond the resulting width. -- always overflows if "<>0", so can be checked as "(A1<>0)&&(B3<>0)" ((A1*B2)<<16)<<32: only low word of "A1*B2" contributes to the final result if overflow is not checked. -- overflows if the hi_word "<>0" -- overflows if R64+(lo_word<<48) produces C-flag ((A0*B3)<<16)<<32: only low word "A0*B3" contributes to the final result if overflow is not checked. -- overflows if the hi_word "<>0" -- overflows if R64+(lo_word<<48) produces C-flag (A0*B2)<<32: the whole dword is significand, name it as "Y" -- overflows if R64+(Y<<32) produces C-flag } asm mov di,word[f1] mov bx,word[f1+2] mov si,word[f2] mov ax,word[f2+2] push bp mov cx,ax mul bx xchg ax,bx mov bp,dx mul si xchg ax,cx add bx,dx adc bp,0 mul di add cx,ax adc bx,dx adc bp,0 mov ax,di mul si add cx,dx adc bx,0 adc bp,0 mov dx,bp pop bp mov word[result],ax mov word[result+2],cx mov word[result+4],bx mov word[result+6],dx mov si,word[f1+4] mov ax,word[f1+6] mov bx,word[f2+6] mov cx,ax or cx,si jz @@nover1 mov cx,word[f2+4] or cx,bx jnz @@overflow @@nover1: test bx,bx jz @@nover2 mov bx,word[f1+2] test bx,bx jnz @@overflow @@nover2: test ax,ax jz @@nover3 or bx,word[f2+2] jnz @@overflow @@nover3: mov di,word[f2] mul di test dx,dx jnz @@overflow mov cx,ax mov ax,word[f2+2] mul si test dx,dx jnz @@overflow add cx,ax jc @@overflow mov ax,di mul si mov bx,ax add cx,dx jc @@overflow mov si,word[f2+4] mov ax,word[f2+6] mov di,word[f1] mul di test dx,dx jnz @@overflow add cx,ax jc @@overflow mov ax,word[f1+2] mul si test dx,dx jnz @@overflow add cx,ax jc @@overflow mov ax,di mul si add bx,ax adc cx,dx jc @@overflow add word[result+4],bx adc word[result+6],cx jnc @@done @@overflow: call FPC_OVERFLOW @@done: end [ 'ax','bx','cx','dx','si','di' ]; end; {$endif VER3_0} {$define FPC_SYSTEM_HAS_MUL_DWORD_TO_QWORD} function fpc_mul_dword_to_qword(f1,f2 : dword) : qword;[public,alias: 'FPC_MUL_DWORD_TO_QWORD']; compilerproc; assembler; nostackframe; asm push bp mov bp, sp mov di,word[bp + 8 + extra_param_offset] // word[f1] mov bx,word[bp + 10 + extra_param_offset] // word[f1+2] mov si,word[bp + 4 + extra_param_offset] // word[f2] mov ax,word[bp + 6 + extra_param_offset] // word[f2+2] mov cx,ax mul bx xchg ax,bx mov bp,dx mul si xchg ax,cx add bx,dx adc bp,0 mul di add cx,ax adc bx,dx adc bp,0 mov ax,di mul si add cx,dx adc bx,0 adc bp,0 mov dx,ax mov ax,bp pop bp end; {$define FPC_SYSTEM_HAS_DIV_QWORD} function fpc_div_qword( n, z: qword ): qword; [public, alias:'FPC_DIV_QWORD']; compilerproc; // Generic "schoolbook" division algorithm // see [D.Knuth, TAOCP, vol.2, sect.4.3.1] for explanation var dig: byte; u: array [0..6] of word; begin asm mov dig,3 // quotient contains 3 digits for "long" division path // Check parameters mov dx,word [n] mov cx,word [n+2] mov bx,word [n+4] mov ax,word [n+6] mov di,ax or di,bx or di,cx jnz @@s1 or di,dx jz @@q // div by 0 // Short division mov dig,al mov dx,word [z+6] cmp dx,di jc @@s0 xchg ax,dx div di @@s0: mov word [result+6],ax mov ax,word [z+4] div di mov word [result+4],ax mov ax,word [z+2] div di mov word [result+2],ax mov ax,word [z] div di mov word [result],ax jmp @@q @@s1: // Long division xor si,si cmp word [z],dx mov di,word [z+2] sbb di,cx mov di,word [z+4] sbb di,bx mov di,word [z+6] sbb di,ax jnc @@n0 // underflow: return 0 mov dig,0 mov word [result],si mov word [result+2],si mov word [result+4],si mov word [result+6],si jmp @@q @@n0: // D1. Normalize divisor: // n := n shl lzv, so that 2^63<=n<2^64 // Note: n>=0x10000 leads to lzv<=47 and q3=0 mov word [result+6],si // q3:=0 mov di,si test ax,ax jnz @@n2 @@n1: add si,16 or ax,bx mov bx,cx mov cx,dx mov dx,di jz @@n1 @@n2: test ah,ah jnz @@n4 add si,8 or ah,al mov al,bh mov bh,bl mov bl,ch mov ch,cl mov cl,dh mov dh,dl mov dl,0 js @@n5 @@n3: inc si shl dx,1 rcl cx,1 rcl bx,1 adc ax,ax @@n4: jns @@n3 @@n5: mov word [n],dx mov word [n+2],cx mov word [n+4],bx mov word [n+6],ax // Adjust divident accordingly: // u := uint128(z) shl lzv; lzv=si=0..47; di=0 mov dx,word [z] mov cx,word [z+2] mov bx,word [z+4] mov ax,word [z+6] push bp mov bp,si // save lzv test si,8 jz @@m0 // << by odd-8 xchg al,ah mov di,ax and di,0FFh mov al,bh mov bh,bl mov bl,ch mov ch,cl mov cl,dh mov dh,dl xor dl,dl @@m0: and si,7 jz @@m2 // << 1..7 @@m1: shl dx,1 rcl cx,1 rcl bx,1 rcl ax,1 rcl di,1 dec si jnz @@m1 @@m2: // si=0, bp=lzv // di:ax:bx:cx:dx shifted by 0..15; 0|16|32 shifts remain sub bp,16 jc @@m5 sub bp,16 jc @@m4 // << 32 pop bp mov word [u],si mov word [u+2],si mov word [u+4],dx mov word [u+6],cx mov word [u+8],bx mov word [u+10],ax mov word [u+12],di jmp @@m6 @@m4: // << 16 pop bp mov word [u],si mov word [u+2],dx mov word [u+4],cx mov word [u+6],bx mov word [u+8],ax mov word [u+10],di mov word [u+12],si jmp @@m6 @@m5: // << 0 pop bp mov word [u],dx mov word [u+2],cx mov word [u+4],bx mov word [u+6],ax mov word [u+8],di mov word [u+10],si mov word [u+12],si @@m6: // D2. Start from j:=2 (since u7=0 and u60 then HandleErrorAddrFrameInd(200,get_pc_addr,get_frame); end; {$define FPC_SYSTEM_HAS_MOD_QWORD} function fpc_mod_qword( n, z: qword ): qword; [public, alias:'FPC_MOD_QWORD']; compilerproc; // Generic "schoolbook" division algorithm // see [D.Knuth, TAOCP, vol.2, sect.4.3.1] for explanation var dig: byte; lzv: word; u: array [0..6] of word; begin asm mov dig,3 // quotient contains 3 digist for "long" division path // Check parameters mov dx,word [n] mov cx,word [n+2] mov bx,word [n+4] mov ax,word [n+6] mov di,ax or di,bx or di,cx jnz @@s1 or di,dx jz @@q // div by 0 // Short division mov dig,al mov dx,word [z+6] cmp dx,di jc @@s0 xchg ax,dx div di @@s0: mov ax,word [z+4] div di mov ax,word [z+2] div di mov ax,word [z] div di mov word [result],dx mov word [result+2],cx mov word [result+4],cx mov word [result+6],cx jmp @@q @@s1: // Long division xor si,si cmp word [z],dx mov di,word [z+2] sbb di,cx mov di,word [z+4] sbb di,bx mov di,word [z+6] sbb di,ax jnc @@n0 // underflow: return z mov dig,0 mov dx,word [z] mov cx,word [z+2] mov bx,word [z+4] mov ax,word [z+6] jmp @@r6 @@n0: // D1. Normalize divisor: // n := n shl lzv, so that 2^63<=n<2^64 // Note: n>=0x10000 leads to lzv<=47 mov di,si test ax,ax jnz @@n2 @@n1: add si,16 or ax,bx mov bx,cx mov cx,dx mov dx,di jz @@n1 @@n2: test ah,ah jnz @@n4 add si,8 or ah,al mov al,bh mov bh,bl mov bl,ch mov ch,cl mov cl,dh mov dh,dl mov dl,0 js @@n5 @@n3: inc si shl dx,1 rcl cx,1 rcl bx,1 adc ax,ax @@n4: jns @@n3 @@n5: mov word [n],dx mov word [n+2],cx mov word [n+4],bx mov word [n+6],ax mov lzv,si // Adjust divident accordingly: // u := uint128(z) shl lzv; lzv=si=0..47; di=0 mov dx,word [z] mov cx,word [z+2] mov bx,word [z+4] mov ax,word [z+6] push bp mov bp,si // save lzv test si,8 jz @@m0 // << by odd-8 xchg al,ah mov di,ax and di,0FFh mov al,bh mov bh,bl mov bl,ch mov ch,cl mov cl,dh mov dh,dl xor dl,dl @@m0: and si,7 jz @@m2 // << 1..7 @@m1: shl dx,1 rcl cx,1 rcl bx,1 rcl ax,1 rcl di,1 dec si jnz @@m1 @@m2: // si=0, bp=lzv // di:ax:bx:cx:dx shifted by 0..15; 0|16|32 shifts remain sub bp,16 jc @@m5 sub bp,16 jc @@m4 // << 32 pop bp mov word [u],si mov word [u+2],si mov word [u+4],dx mov word [u+6],cx mov word [u+8],bx mov word [u+10],ax mov word [u+12],di jmp @@m6 @@m4: // << 16 pop bp mov word [u],si mov word [u+2],dx mov word [u+4],cx mov word [u+6],bx mov word [u+8],ax mov word [u+10],di mov word [u+12],si jmp @@m6 @@m5: // << 0 pop bp mov word [u],dx mov word [u+2],cx mov word [u+4],bx mov word [u+6],ax mov word [u+8],di mov word [u+10],si mov word [u+12],si @@m6: // D2. Start from j:=2 (since u7=0 and u6> 32..47 mov bx,ax mov cx,word [u+6] mov dx,word [u+4] jmp @@r3 @@r1: // >> 16..31 mov bx,word [u+6] mov cx,word [u+4] mov dx,word [u+2] jmp @@r3 @@r2: // >> 0..15 mov ax,word [u+6] mov bx,word [u+4] mov cx,word [u+2] mov dx,word [u] @@r3: and si,15 sub si,8 jc @@r4 // >> 8..15 mov dl,dh mov dh,cl mov cl,ch mov ch,bl mov bl,bh mov bh,al mov al,ah xor ah,ah @@r4: and si,7 jz @@r6 // >> 1..7 @@r5: shr ax,1 rcr bx,1 rcr cx,1 rcr dx,1 dec si jnz @@r5 @@r6: mov word [result],dx mov word [result+2],cx mov word [result+4],bx mov word [result+6],ax @@q: end; if dig<>0 then HandleErrorAddrFrameInd(200,get_pc_addr,get_frame); end;