* split fpc_mul_<64 bit> into separate procedures with and without overflow checking

git-svn-id: trunk@35454 -
2025-09-15 12:49:13 +02:00 · 2017-02-19 19:15:14 +00:00 · 2017-02-19 19:15:14 +00:00 · 7213a13081
commit 7213a13081
parent 3900ccf260
7 changed files with 345 additions and 56 deletions
--- a/compiler/nadd.pas
+++ b/compiler/nadd.pas
@ -2944,9 +2944,7 @@ implementation
              end;

            { otherwise, create the parameters for the helper }
-            right := ccallparanode.create(
-              cordconstnode.create(ord(cs_check_overflow in current_settings.localswitches),pasbool8type,true),
-              ccallparanode.create(right,ccallparanode.create(left,nil)));
+            right :=   ccallparanode.create(right,ccallparanode.create(left,nil));
            left := nil;
            { only qword needs the unsigned code, the
              signed code is also used for currency }
@ -2954,6 +2952,9 @@ implementation
              procname := 'fpc_mul_int64'
            else
              procname := 'fpc_mul_qword';
+            if cs_check_overflow in current_settings.localswitches then
+              procname := procname + '_checkoverflow';
+
            result := ccallnode.createintern(procname,right);
            right := nil;
          end;
--- a/rtl/arm/int64p.inc
+++ b/rtl/arm/int64p.inc
@ -13,9 +13,38 @@

 **********************************************************************}

+{$ifndef VER3_0}
 {$if (not defined(CPUTHUMB)) and defined(CPUARM_HAS_UMULL)}
 {$define FPC_SYSTEM_HAS_MUL_QWORD}
-function fpc_mul_qword(f1,f2 : qword;checkoverflow : longbool) : qword;assembler;nostackframe;[public,alias: 'FPC_MUL_QWORD']; compilerproc;
+function fpc_mul_qword(f1,f2 : qword) : qword;assembler;nostackframe;[public,alias: 'FPC_MUL_QWORD']; compilerproc;
+asm
+  stmfd sp!,{r4,r5,r6,r14}
+  mov r6,#0
+  // r4 = result lo, r5 = result hi
+{$ifdef ENDIAN_LITTLE}
+  // lo(f1)*lo(f2)
+  umull r4,r5,r0,r2
+  // lo(f1)*hi(f2)
+  umlal r5,r6,r0,r3
+  // hi(f1)*lo(f2)
+  umlal r5,r6,r1,r2
+  mov r0,r4
+  mov r1,r5
+{$else}
+  // lo(f1)*lo(f2)
+  umull r4,r5,r1,r3
+  // lo(f1)*hi(f2)
+  umlal r5,r6,r1,r2
+  // hi(f1)*lo(f2)
+  umlal r5,r6,r0,r3
+  mov r1,r4
+  mov r0,r5
+{$endif}
+  ldmfd sp!,{r4,r5,r6,r15}
+end;
+
+
+function fpc_mul_qword_checkoverflow(f1,f2 : qword) : qword;assembler;nostackframe;[public,alias: 'FPC_MUL_QWORD_CHECKOVERFLOW']; compilerproc;
 asm
  stmfd sp!,{r4,r5,r6,r14}
  mov r6,#0
@ -51,10 +80,6 @@ asm
 {$endif}
  // no overflow?
  beq .Lexit
-  // checkoverflow?
-  ldr r2,[sp,#16]
-  cmp r2,#0
-  beq .Lexit

  mov r0,#215
  mov r1,fp
@ -63,4 +88,4 @@ asm
  ldmfd sp!,{r4,r5,r6,r15}
 end;
 {$endif (not defined(CPUTHUMB)) and defined(CPUARM_HAS_UMULL)}
-
+{$endif VER3_0}
--- a/rtl/i386/int64p.inc
+++ b/rtl/i386/int64p.inc
@ -361,15 +361,10 @@
            movl saveedi,%edi
      end;

+{$ifndef VER3_0}
 {$define FPC_SYSTEM_HAS_MUL_QWORD}
-    { multiplies two qwords
-      the longbool for checkoverflow avoids a misaligned stack
-    }
-    function fpc_mul_qword(f1,f2 : qword;checkoverflow : longbool) : qword;[public,alias: 'FPC_MUL_QWORD']; compilerproc;
-      var
-        overflowed : boolean;
+    function fpc_mul_qword(f1,f2 : qword) : qword;[public,alias: 'FPC_MUL_QWORD']; compilerproc;
      begin
-        overflowed:=false;
        { the following piece of code is taken from the
          AMD Athlon Processor x86 Code Optimization manual }
        asm
@ -383,8 +378,6 @@
           mull %edx
           jmp .Lqwordmulready
        .Lqwordmultwomul:
-           cmpl $0,checkoverflow
-           jnz  .Loverflowchecked
           imul f1+4,%edx
           imul %eax,%ecx
           addl %edx,%ecx
@ -393,6 +386,29 @@
        .Lqwordmulready:
           movl %eax,__RESULT
           movl %edx,__RESULT+4
+        .Lend:
+        end [ 'eax','edx','ecx'];
+      end;
+
+	
+    function fpc_mul_qword_checkoverflow(f1,f2 : qword) : qword;[public,alias: 'FPC_MUL_QWORD_CHECKOVERFLOW']; compilerproc;
+      var
+        overflowed : boolean;
+      begin
+        overflowed:=false;
+        { the following piece of code is taken from the
+          AMD Athlon Processor x86 Code Optimization manual }
+        asm
+           movl f1+4,%edx
+           movl f2+4,%ecx
+           orl %ecx,%edx
+           movl f2,%edx
+           movl f1,%eax
+           jnz .Loverflowchecked
+           { if both upper dwords are =0 then it cannot overflow }
+           mull %edx
+           movl %eax,__RESULT
+           movl %edx,__RESULT+4
           jmp .Lend

        .Loverflowchecked:
@ -431,4 +447,4 @@
        if overflowed then
          HandleErrorFrame(215,get_frame);
      end;
-
+{$endif VER3_0}
--- a/rtl/i8086/int64p.inc
+++ b/rtl/i8086/int64p.inc
@ -1,3 +1,4 @@
+
 {
    This file is part of the Free Pascal run time library.
    Copyright (c) 2013 by the Free Pascal development team
@ -13,8 +14,9 @@

 **********************************************************************}

+{$ifndef VER3_0}
 {$define FPC_SYSTEM_HAS_MUL_QWORD}
-function fpc_mul_qword( f1, f2: qword; checkoverflow: longbool ): qword; [public,alias: 'FPC_MUL_QWORD']; compilerproc;
+function fpc_mul_qword( f1, f2: qword): qword; [public,alias: 'FPC_MUL_QWORD']; compilerproc;
 begin
 { routine contributed by Max Nazhalov

@ -97,9 +99,6 @@ begin
    mov     word[result+6],dx
    mov     si,word[f1+4]
    mov     ax,word[f1+6]
-    mov     bx,word[checkoverflow]
-    or      bx,word[checkoverflow+2]
-    jnz     @@checked
    mov     di,word[f2]
    mul     di
    mov     cx,ax
@ -124,74 +123,156 @@ begin
    adc     cx,dx
    add     word[result+4],bx
    adc     word[result+6],cx
-    jmp     @@done
-@@checked:
+  end [ 'ax','bx','cx','dx','si','di' ];
+end;
+
+
+function fpc_mul_qword_checkoverflow( f1, f2: qword): qword; [public,alias: 'FPC_MUL_QWORD_CHECKOVERFLOW']; compilerproc;
+begin
+{ routine contributed by Max Nazhalov
+
+64-bit multiplication via 16-bit digits: (A3:A2:A1:A0)*(B3:B2:B1:B0)
+
+//////// STEP 1; break-down to 32-bit multiplications, each of them generates 64-bit result:
+  (A3:A2*B3:B2)<<64 + (A3:A2*B1:B0)<<32 + (A1:A0*B3:B2)<<32 + (A1:A0*B1:B0)
+
+(A1:A0*B1:B0) = (A1*B1)<<32 + (A1*B0)<<16 + (A0*B1)<<16 + (A0:B0)
+ -- never overflows, forms the base of the final result, name it as "R64"
+
+(A3:A2*B3:B2) is not required for the 64-bit result if overflow is not checked, since it is completely beyond the resulting width.
+ -- always overflows if "<>0", so can be checked as "((A2|A3)<>0)&&(B2|B3)<>0)"
+
+(A3:A2*B1:B0) and (A1:A0*B3:B2) are partially required for the final result
+ -- to be calculated on steps 2 and 3 as a correction for the "R64"
+
+//////// STEP 2; calculate "R64+=(A3:A2*B1:B0)<<32" (16-bit multiplications, each of them generates 32-bit result):
+  (A3*B1)<<32 + (A3*B0)<<16 + (A2*B1)<<16 + (A2*B0)
+
+((A3*B1)<<32)<<32 is not required for the 64-bit result if overflow is not checked, since it is completely beyond the resulting width.
+ -- always overflows if "<>0", so can be checked as "(A3<>0)&&(B1<>0)"
+
+((A3*B0)<<16)<<32: only low word of "A3*B0" contributes to the final result if overflow is not checked.
+ -- overflows if the hi_word "<>0"
+ -- overflows if R64+(lo_word<<48) produces C-flag
+
+((A2*B1)<<16)<<32: only low word of "A2*B1" contributes to the final result if overflow is not checked.
+ -- overflows if the hi_word "<>0"
+ -- overflows if R64+(lo_word<<48) produces C-flag
+
+(A2*B0)<<32: the whole dword is significand, name it as "X"
+ -- overflows if R64+(X<<32) produces C-flag
+
+//////// STEP 3; calculate "R64+=(A1:A0*B3:B2)<<32" (16-bit multiplications, each of them generates 32-bit result):
+  (A1*B3)<<32 + (A1*B2)<<16 + (A0*B3)<<16 + (A0*B2)
+
+((A1*B3)<<32)<<32 is not required for the 64-bit result if overflow is not checked, since it is completely beyond the resulting width.
+ -- always overflows if "<>0", so can be checked as "(A1<>0)&&(B3<>0)"
+
+((A1*B2)<<16)<<32: only low word of "A1*B2" contributes to the final result if overflow is not checked.
+ -- overflows if the hi_word "<>0"
+ -- overflows if R64+(lo_word<<48) produces C-flag
+
+((A0*B3)<<16)<<32: only low word "A0*B3" contributes to the final result if overflow is not checked.
+ -- overflows if the hi_word "<>0"
+ -- overflows if R64+(lo_word<<48) produces C-flag
+
+(A0*B2)<<32: the whole dword is significand, name it as "Y"
+ -- overflows if R64+(Y<<32) produces C-flag
+}
+  asm
+    mov     di,word[f1]
+    mov     bx,word[f1+2]
+    mov     si,word[f2]
+    mov     ax,word[f2+2]
+    push    bp
+    mov     cx,ax
+    mul     bx
+    xchg    ax,bx
+    mov     bp,dx
+    mul     si
+    xchg    ax,cx
+    add     bx,dx
+    adc     bp,0
+    mul     di
+    add     cx,ax
+    adc     bx,dx
+    adc     bp,0
+    mov     ax,di
+    mul     si
+    add     cx,dx
+    adc     bx,0
+    adc     bp,0
+    mov     dx,bp
+    pop     bp
+    mov     word[result],ax
+    mov     word[result+2],cx
+    mov     word[result+4],bx
+    mov     word[result+6],dx
+    mov     si,word[f1+4]
+    mov     ax,word[f1+6]
    mov     bx,word[f2+6]
    mov     cx,ax
    or      cx,si
    jz      @@nover1
    mov     cx,word[f2+4]
    or      cx,bx
-    jnz     @@done
+    jnz     @@overflow
@@nover1:
    test    bx,bx
    jz      @@nover2
    mov     bx,word[f1+2]
    test    bx,bx
-    jnz     @@done
+    jnz     @@overflow
@@nover2:
    test    ax,ax
    jz      @@nover3
    or      bx,word[f2+2]
-    jnz     @@done
+    jnz     @@overflow
@@nover3:
    mov     di,word[f2]
    mul     di
    test    dx,dx
-    jnz     @@done
+    jnz     @@overflow
    mov     cx,ax
    mov     ax,word[f2+2]
    mul     si
    test    dx,dx
-    jnz     @@done
+    jnz     @@overflow
    add     cx,ax
-    jc      @@done
+    jc      @@overflow
    mov     ax,di
    mul     si
    mov     bx,ax
    add     cx,dx
-    jc      @@done
+    jc      @@overflow
    mov     si,word[f2+4]
    mov     ax,word[f2+6]
    mov     di,word[f1]
    mul     di
    test    dx,dx
-    jnz     @@done
+    jnz     @@overflow
    add     cx,ax
-    jc      @@done
+    jc      @@overflow
    mov     ax,word[f1+2]
    mul     si
    test    dx,dx
-    jnz     @@done
+    jnz     @@overflow
    add     cx,ax
-    jc      @@done
+    jc      @@overflow
    mov     ax,di
    mul     si
    add     bx,ax
    adc     cx,dx
-    jc      @@done
+    jc      @@overflow
    add     word[result+4],bx
    adc     word[result+6],cx
-    jc      @@done
-    // checked and succeed
-    xor     ax,ax
-    mov     word[checkoverflow],ax
-    mov     word[checkoverflow+2],ax
+    jnc     @@done
+@@overflow:
+	call	FPC_OVERFLOW
@@done:
  end [ 'ax','bx','cx','dx','si','di' ];
-  if checkoverflow then
-    HandleErrorAddrFrameInd(215,get_pc_addr,get_frame);
 end;
+{$endif VER3_0}


 {$define FPC_SYSTEM_HAS_MUL_DWORD_TO_QWORD}
--- a/rtl/inc/compproc.inc
+++ b/rtl/inc/compproc.inc
@ -602,8 +602,15 @@ function fpc_div_qword(n,z : qword) : qword; compilerproc;
 function fpc_mod_qword(n,z : qword) : qword; compilerproc;
 function fpc_div_int64(n,z : int64) : int64; compilerproc;
 function fpc_mod_int64(n,z : int64) : int64; compilerproc;
+{$ifdef VER3_0}
 function fpc_mul_qword(f1,f2 : qword;checkoverflow : longbool) : qword; compilerproc;
 function fpc_mul_int64(f1,f2 : int64;checkoverflow : longbool) : int64; compilerproc;
+{$else VER3_0}
+function fpc_mul_qword(f1,f2 : qword) : qword; compilerproc;
+function fpc_mul_qword_checkoverflow(f1,f2 : qword) : qword; compilerproc;
+function fpc_mul_int64(f1,f2 : int64) : int64; compilerproc;
+function fpc_mul_int64_checkoverflow(f1,f2 : int64) : int64; compilerproc;
+{$endif VER3_0}
 function fpc_mul_dword_to_qword(f1,f2 : dword) : qword; compilerproc;
 function fpc_mul_longint_to_int64(f1,f2 : longint) : int64; compilerproc;

--- a/rtl/inc/int64.inc
+++ b/rtl/inc/int64.inc
@ -1,3 +1,4 @@
+
 {
    This file is part of the Free Pascal run time library.
    Copyright (c) 1999-2000 by the Free Pascal development team
@ -267,6 +268,7 @@
      end;
 {$endif FPC_SYSTEM_HAS_MOD_INT64}

+{$ifdef VER3_0}

 {$ifndef FPC_SYSTEM_HAS_MUL_QWORD}
    { multiplies two qwords
@ -304,24 +306,77 @@
      end;
 {$endif FPC_SYSTEM_HAS_MUL_QWORD}

+{$else VER3_0}
+
+{$ifndef FPC_SYSTEM_HAS_MUL_QWORD}
+    function fpc_mul_qword(f1,f2 : qword) : qword;[public,alias: 'FPC_MUL_QWORD']; compilerproc;
+      var
+         bitpos : qword;
+         l : longint;
+      begin
+        result:=0;
+        bitpos:=1;
+
+        for l:=0 to 63 do
+          begin
+            if (f2 and bitpos)<>0 then
+              result:=result+f1;
+            f1:=f1 shl 1;
+            bitpos:=bitpos shl 1;
+          end;
+      end;
+
+
+    function fpc_mul_qword_checkoverflow(f1,f2 : qword) : qword;[public,alias: 'FPC_MUL_QWORD_CHECKOVERFLOW']; compilerproc;
+      var
+         _f1,bitpos : qword;
+         l : longint;
+         f1overflowed : boolean;
+      begin
+        result:=0;
+        bitpos:=1;
+        f1overflowed:=false;
+
+        for l:=0 to 63 do
+          begin
+            if (f2 and bitpos)<>0 then
+              begin
+                _f1:=result;
+                result:=result+f1;
+
+                { if one of the operands is greater than the result an
+                  overflow occurs                                      }
+                if (f1overflowed or ((_f1<>0) and (f1<>0) and
+                  ((_f1>result) or (f1>result)))) then
+                  HandleErrorAddrFrameInd(215,get_pc_addr,get_frame);
+              end;
+            { when bootstrapping, we forget about overflow checking for qword :) }
+            f1overflowed:=f1overflowed or ((f1 and (qword(1) shl 63))<>0);
+            f1:=f1 shl 1;
+            bitpos:=bitpos shl 1;
+          end;
+      end;
+{$endif FPC_SYSTEM_HAS_MUL_QWORD}
+
+{$endif VER3_0}

 {$ifndef FPC_SYSTEM_HAS_MUL_DWORD_TO_QWORD}
-    function fpc_mul_qword_compilerproc(f1,f2 : qword;checkoverflow : longbool) : qword; external name 'FPC_MUL_QWORD';
+    function fpc_mul_qword_compilerproc(f1,f2 : qword) : qword; external name 'FPC_MUL_QWORD';

    function fpc_mul_dword_to_qword(f1,f2 : dword) : qword;[public,alias: 'FPC_MUL_DWORD_TO_QWORD']; compilerproc;
      begin
-        fpc_mul_dword_to_qword:=fpc_mul_qword_compilerproc(f1,f2,false);
+        fpc_mul_dword_to_qword:=fpc_mul_qword_compilerproc(f1,f2);
      end;
 {$endif FPC_SYSTEM_HAS_MUL_DWORD_TO_QWORD}


+{$ifdef VER3_0}
+
 {$ifndef FPC_SYSTEM_HAS_MUL_INT64}
    function fpc_mul_int64(f1,f2 : int64;checkoverflow : longbool) : int64;[public,alias: 'FPC_MUL_INT64']; compilerproc;
-
      var
         sign : boolean;
         q1,q2,q3 : qword;
-
      begin
 {$ifdef EXCLUDE_COMPLEX_PROCS}
         runerror(219);
@ -370,9 +425,68 @@
      end;
 {$endif FPC_SYSTEM_HAS_MUL_INT64}

+{$else VER3_0}
+
+{$ifndef FPC_SYSTEM_HAS_MUL_INT64}
+    function fpc_mul_int64(f1,f2 : int64) : int64;[public,alias: 'FPC_MUL_INT64']; compilerproc;
+      begin
+        { there's no difference between signed and unsigned multiplication,
+          when the destination size is equal to the source size and overflow
+          checking is off }
+        { qword(f1)*qword(f2) is coded as a call to mulqword }
+        result:=int64(qword(f1)*qword(f2));
+      end;
+
+
+    function fpc_mul_int64_checkoverflow(f1,f2 : int64) : int64;[public,alias: 'FPC_MUL_INT64_CHECKOVERFLOW']; compilerproc;
+{$ifdef EXCLUDE_COMPLEX_PROCS}
+      begin
+        runerror(217);
+      end;
+{$else EXCLUDE_COMPLEX_PROCS}
+      var
+         sign : boolean;
+         q1,q2,q3 : qword;
+      begin
+        sign:=false;
+        if f1<0 then
+          begin
+            sign:=not(sign);
+            q1:=qword(-f1);
+          end
+        else
+          q1:=f1;
+        if f2<0 then
+          begin
+            sign:=not(sign);
+            q2:=qword(-f2);
+          end
+        else
+          q2:=f2;
+        { the q1*q2 is coded as call to mulqword }
+        q3:=q1*q2;
+
+        if (q1 <> 0) and (q2 <>0) and
+          ((q1>q3) or (q2>q3) or
+            { the bit 63 can be only set if we have $80000000 00000000 }
+            { and sign is true                                         }
+            (q3 shr 63<>0) and
+             ((q3<>qword(qword(1) shl 63)) or not(sign))
+            ) then
+          HandleErrorAddrFrameInd(215,get_pc_addr,get_frame);
+
+        if sign then
+          result:=-q3
+        else
+          result:=q3;
+      end;
+{$endif EXCLUDE_COMPLEX_PROCS}
+{$endif FPC_SYSTEM_HAS_MUL_INT64}
+
+{$endif VER3_0}

 {$ifndef FPC_SYSTEM_HAS_MUL_LONGINT_TO_INT64}
-    function fpc_mul_int64_compilerproc(f1,f2 : int64;checkoverflow : longbool) : int64; external name 'FPC_MUL_INT64';
+    function fpc_mul_int64_compilerproc(f1,f2 : int64) : int64; external name 'FPC_MUL_INT64';

    function fpc_mul_longint_to_int64(f1,f2 : longint) : int64;[public,alias: 'FPC_MUL_LONGINT_TO_INT64']; compilerproc;
 {$ifdef EXCLUDE_COMPLEX_PROCS}
@ -381,7 +495,7 @@
      end;
 {$else EXCLUDE_COMPLEX_PROCS}
      begin
-        fpc_mul_longint_to_int64:=fpc_mul_int64_compilerproc(f1,f2,false);
+        fpc_mul_longint_to_int64:=fpc_mul_int64_compilerproc(f1,f2);
      end;
 {$endif EXCLUDE_COMPLEX_PROCS}

--- a/rtl/powerpc/int64p.inc
+++ b/rtl/powerpc/int64p.inc
@ -140,11 +140,9 @@
        mr   R4,R6
      end;

+{$ifndef VER3_0}	
 {$define FPC_SYSTEM_HAS_MUL_QWORD}
-    { multiplies two qwords
-      the longbool for checkoverflow avoids a misaligned stack
-    }
-    function fpc_mul_qword(f1,f2 : qword;checkoverflow : longbool) : qword;[public,alias: 'FPC_MUL_QWORD']; compilerproc;
+    function fpc_mul_qword(f1,f2 : qword) : qword;[public,alias: 'FPC_MUL_QWORD']; compilerproc;
      assembler; nostackframe;
      asm
        // (r3:r4) = (r3:r4) * (r5:r6),  checkoverflow is in r7
@ -185,6 +183,54 @@
        add     r9,r9,r12
        cmplwi  cr7,r9,64   // is the sum now >= 64?
        cmplwi  cr1,r9,62   // or <= 62?
+
+      .LDone:
+        mullw   r4,r4,r6    // lsw of product of lsw's
+        mr      r3,r8       // get msw of product in correct register
+      end;
+	
+	
+    function fpc_mul_qword_checkoverflow(f1,f2 : qword) : qword;[public,alias: 'FPC_MUL_QWORD_CHECKOVERFLOW']; compilerproc;
+      assembler; nostackframe;
+      asm
+        // (r3:r4) = (r3:r4) * (r5:r6),  checkoverflow is in r7
+        //   res        f1        f2
+
+        or.     r10,r3,r5    // are both msw's 0?
+        mulhwu  r8,r4,r6    // msw of product of lsw's
+        xor     r0,r0,r0    // r0 := 0 for overflow checking
+        beq     .LDone      // if both msw's are zero, skip cross products
+        mullw   r9,r4,r5    // lsw of first cross-product
+        cntlzw  r11,r3      // count leading zeroes of msw1
+        cntlzw  r12,r5      // count leading zeroes of msw2
+        mullw   r7,r3,r6    // lsw of second cross-product
+        add     r12,r11,r12  // sum of leading zeroes
+        mr      r10,r8
+        or      r0,r12,r0    // maximise sum if no overflow checking, otherwise it remains
+        add     r8,r8,r9    // add
+        cmplwi  cr1,r0,64   // >= 64 leading zero bits in total? If so, no overflow
+        add     r8,r8,r7    // add
+        bge+    cr1,.LDone  // if the sum of leading zero's >= 64 (or checkoverflow was 0)
+                            // there's no overflow, otherwise more thorough check
+        add     r7,r7,r9
+        mulhwu  r3,r6,r3
+        addc    r7,r7,r10   // add the msw of the product of the lsw's, record carry
+        cntlzw  r9,r5
+        cntlzw  r10,r4      // get leading zeroes count of lsw f1
+        mulhwu  r5,r4,r5
+        addze   r3,r3
+        subfic  r0,r11,31   // if msw f1 = 0, then r0 := -1, else r0 >= 0
+        cntlzw  r7,r6
+        subfic  r11,r9,31   // same for f2
+        srawi   r0,r0,31    // if msw f1 = 0, then r0 := 1, else r0 := 0
+        srawi   r11,r11,31
+        and     r10,r10,r0    // if msw f1 <> 0, the leading zero count lsw f1 := 0
+        and     r9,r7,r11     // same for f2
+        or.     r5,r5,r3
+        add     r9,r9,r10    // add leading zero counts of lsw's to sum if appropriate
+        add     r9,r9,r12
+        cmplwi  cr7,r9,64   // is the sum now >= 64?
+        cmplwi  cr1,r9,62   // or <= 62?
        bge+    cr7,.LDone      // >= 64 leading zeroes -> no overflow
        ble+    cr1,.LOverflow  // <= 62 leading zeroes -> overflow
                            // for 63 zeroes, we need additional checks
@ -198,5 +244,4 @@
        mullw   r4,r4,r6    // lsw of product of lsw's
        mr      r3,r8       // get msw of product in correct register
      end;
-
-
+{$endif VER3_0}