From 81b2cf5d65d9969277b2dbee61848c7b238ed350 Mon Sep 17 00:00:00 2001
From: florian <florian@freepascal.org>
Date: Sat, 6 Jan 2018 14:58:28 +0000
Subject: [PATCH] * slightly modified patch by J. Gareth Moreton: Optimization
 for 'mod' on i386/x86-64, resolves #32945

git-svn-id: trunk@37922 -
---
 .gitattributes            |   2 +
 compiler/x86/nx86mat.pas  | 107 +++++++++++++++++++++++++++++++++---
 tests/test/cg/tmoddiv3.pp | 103 +++++++++++++++++++++++++++++++++++
 tests/test/cg/tmoddiv4.pp | 110 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 314 insertions(+), 8 deletions(-)
 create mode 100644 tests/test/cg/tmoddiv3.pp
 create mode 100644 tests/test/cg/tmoddiv4.pp

diff --git a/.gitattributes b/.gitattributes
index 94b0bcfc93..190aaca598 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -11948,6 +11948,8 @@ tests/test/cg/tmanypar.pp svneol=native#text/plain
 tests/test/cg/tmoddiv.pp svneol=native#text/plain
 tests/test/cg/tmoddiv1.pp svneol=native#text/plain
 tests/test/cg/tmoddiv2.pp svneol=native#text/plain
+tests/test/cg/tmoddiv3.pp svneol=native#text/pascal
+tests/test/cg/tmoddiv4.pp svneol=native#text/pascal
 tests/test/cg/tmul3264.pp svneol=native#text/plain
 tests/test/cg/tneg.pp svneol=native#text/plain
 tests/test/cg/tnegnotassign1.pp svneol=native#text/plain
diff --git a/compiler/x86/nx86mat.pas b/compiler/x86/nx86mat.pas
index b13f5c85d4..d0b20aa562 100644
--- a/compiler/x86/nx86mat.pas
+++ b/compiler/x86/nx86mat.pas
@@ -55,10 +55,10 @@ interface
       constexp,
       cutils,verbose,globals,
       symconst,symdef,
-      aasmbase,aasmtai,aasmdata,defutil,
+      aasmbase,aasmtai,aasmcpu,aasmdata,defutil,
       cgbase,pass_1,pass_2,
       ncon,
-      cpubase,
+      cpubase,cpuinfo,
       cga,cgobj,hlcgobj,cgx86,cgutils;
 
 
@@ -378,8 +378,9 @@ interface
 
     procedure tx86moddivnode.pass_generate_code;
       var
-        hreg1,hreg2,rega,regd:Tregister;
+        hreg1,hreg2,hreg3,rega,regd:Tregister;
         power:longint;
+        instr:TAiCpu;
         op:Tasmop;
         cgsize:TCgSize;
         opsize:topsize;
@@ -387,6 +388,8 @@ interface
         d,m: aword;
         m_add, invertsign: boolean;
         s: byte;
+      label
+        DefaultDiv;
       begin
         secondpass(left);
         if codegenerror then
@@ -522,15 +525,103 @@ interface
               end;
           end
         { unsigned modulus by a (+/-)power-of-2 constant? }
-        else if (nodetype=modn) and (right.nodetype=ordconstn) and
-                isabspowerof2(tordconstnode(right).value,power) and
-                not(is_signed(left.resultdef)) then
+        else if (nodetype=modn) and (right.nodetype=ordconstn) and not(is_signed(left.resultdef)) then
           begin
-            emit_const_reg(A_AND,opsize,(aint(1) shl power)-1,hreg1);
-            location.register:=hreg1;
+            if isabspowerof2(tordconstnode(right).value,power) then
+              begin
+                emit_const_reg(A_AND,opsize,(aint(1) shl power)-1,hreg1);
+                location.register:=hreg1;
+              end
+            else
+              begin
+                d:=tordconstnode(right).value.svalue;
+                if d>=aword(1) shl (left.resultdef.size*8-1) then { d has the top bit of the operand width set }
+                  begin
+                    { With such a divisor the unsigned quotient can only be 0 or 1, so x mod d is x-d if x>=d and x otherwise. }
+                    if not (CPUX86_HAS_CMOV in cpu_capabilities[current_settings.cputype]) then
+                      goto DefaultDiv; { the sequence below needs CMOVcc; otherwise use the generic code at DefaultDiv }
+                    { Build result = hreg1 + addend with addend = -d or 0; hreg1 presumably holds x (it is loaded outside this hunk - confirm). }
+                    location.register:=cg.getintregister(current_asmdata.CurrAsmList,cgsize);
+                    hreg3:=cg.getintregister(current_asmdata.CurrAsmList,cgsize);
+
+                    m := aword(-aint(d)); { Two's complement of d }
+
+                    if (cgsize in [OS_64,OS_S64]) then { Cannot use 64-bit constants in CMP }
+                      begin
+                        hreg2:=cg.getintregister(current_asmdata.CurrAsmList,cgsize);
+                        emit_const_reg(A_MOV,opsize,aint(d),hreg2);
+                        emit_const_reg(A_MOV,opsize,aint(m),hreg3);
+                        emit_reg_reg(A_XOR,opsize,location.register,location.register); { addend:=0 }
+                        cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
+                        emit_reg_reg(A_CMP,opsize,hreg2,hreg1);
+
+                        { Emit conditional move that depends on the carry flag }
+                        instr:=TAiCpu.op_reg_reg(A_CMOVcc,opsize,hreg3,location.register);
+                        instr.condition := C_AE; { addend:=-d when hreg1>=d, unsigned compare }
+                        current_asmdata.CurrAsmList.concat(instr);
+                        cg.a_reg_dealloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
+                      end
+                    else
+                      begin
+                        emit_const_reg(A_MOV,opsize,aint(m),hreg3);
+                        emit_reg_reg(A_XOR,opsize,location.register,location.register); { addend:=0 }
+
+                        cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
+                        emit_const_reg(A_CMP,opsize,aint(d),hreg1);
+
+                        { Emit conditional move that depends on the carry flag }
+                        instr:=TAiCpu.op_reg_reg(A_CMOVcc,opsize,hreg3,location.register);
+                        instr.condition := C_AE; { addend:=-d when hreg1>=d, unsigned compare }
+                        current_asmdata.CurrAsmList.concat(instr);
+                        cg.a_reg_dealloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
+                      end;
+
+                    emit_reg_reg(A_ADD,opsize,hreg1,location.register); { result:=hreg1+addend }
+                  end
+                else
+                  begin
+                    { Convert the division to a multiplication }
+                    calc_divconst_magic_unsigned(resultdef.size*8,d,m,m_add,s); { m, m_add and s are consumed below as multiplier, add-correction flag and shift count }
+                    cg.getcpuregister(current_asmdata.CurrAsmList,rega);
+                    emit_const_reg(A_MOV,opsize,aint(m),rega);
+                    cg.getcpuregister(current_asmdata.CurrAsmList,regd);
+                    emit_reg(A_MUL,opsize,hreg1); { presumably rega*hreg1 with the high half ending up in regd; rega/regd are chosen outside this hunk - confirm }
+                    cg.ungetcpuregister(current_asmdata.CurrAsmList,rega);
+                    hreg2:=cg.getintregister(current_asmdata.CurrAsmList,cgsize);
+                    emit_reg_reg(A_MOV,opsize,hreg1,hreg2); { copy of the numerator, used for the final subtraction }
+                    if m_add then
+                      begin
+                        { addition can overflow, shift first bit considering carry,
+                          then shift remaining bits in regular way. }
+                        cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
+                        emit_reg_reg(A_ADD,opsize,hreg1,regd);
+                        emit_const_reg(A_RCR,opsize,1,regd);
+                        cg.a_reg_dealloc(current_asmdata.CurrAsmList,NR_DEFAULTFLAGS);
+                        dec(s);
+                      end;
+                    if s<>0 then
+                      emit_const_reg(A_SHR,opsize,aint(s),regd);
+                    { regd is now used as the quotient: multiply it by d and subtract the product from the numerator copy to get the remainder }
+                    if (cgsize in [OS_64,OS_S64]) then { Cannot use 64-bit constants in IMUL }
+                      begin
+                        hreg3:=cg.getintregister(current_asmdata.CurrAsmList,cgsize);
+                        emit_const_reg(A_MOV,opsize,aint(d),hreg3);
+                        emit_reg_reg(A_IMUL,opsize,hreg3,regd);
+                      end
+                    else
+                      emit_const_reg(A_IMUL,opsize,aint(d),regd);
+
+                    emit_reg_reg(A_SUB,opsize,regd,hreg2); { hreg2:=numerator-quotient*d }
+                    cg.ungetcpuregister(current_asmdata.CurrAsmList,regd);
+                    location.register:=cg.getintregister(current_asmdata.CurrAsmList,cgsize);
+                    cg.a_load_reg_reg(current_asmdata.CurrAsmList,cgsize,cgsize,hreg2,location.register)
+                  end;
+
+              end;
           end
         else
           begin
+DefaultDiv:
             {Bring denominator to a register.}
             cg.getcpuregister(current_asmdata.CurrAsmList,rega);
             emit_reg_reg(A_MOV,opsize,hreg1,rega);
diff --git a/tests/test/cg/tmoddiv3.pp b/tests/test/cg/tmoddiv3.pp
new file mode 100644
index 0000000000..bfcd59d9d6
--- /dev/null
+++ b/tests/test/cg/tmoddiv3.pp
@@ -0,0 +1,103 @@
+program testfile2;
+{ Checks LongWord div and mod by a fixed set of constants against a table of precomputed results. }
+const
+  TestValues: array[0..9] of LongWord = (500, 1, 0, 995, $7FFFFFFF, $80000000, $80000001, $80000002, $FFFFFFFF, 1000000);
+{ ExpectedResults[C,Col]: row C belongs to TestValues[C]; the columns follow the order of the div/mod statements in the main loop. }
+const
+  ExpectedResults: array[0..9,1..16] of LongWord = (
+    (0,500,500,0,166,2,0,500,0,500,0,500,0,500,0,500),
+    (0,1,1,0,0,1,0,1,0,1,0,1,0,1,0,1),
+    (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0),
+    (0,995,995,0,331,2,0,995,0,995,0,995,0,995,0,995),
+    (2147483,647,2147483647,0,715827882,1,524287,4095,1,0,0,2147483647,0,2147483647,0,2147483647),
+    (2147483,648,2147483648,0,715827882,2,524288,0,1,1,1,0,0,2147483648,0,2147483648),
+    (2147483,649,2147483649,0,715827883,0,524288,1,1,2,1,1,1,0,0,2147483649),
+    (2147483,650,2147483650,0,715827883,1,524288,2,1,3,1,2,1,1,0,2147483650),
+    (4294967,295,4294967295,0,1431655765,0,1048575,4095,2,1,1,2147483647,1,2147483646,1,0),
+    (1000,0,1000000,0,333333,1,244,576,0,1000000,0,1000000,0,1000000,0,1000000));
+
+var
+  X, Y, C, Col: LongWord;
+
+procedure DoCheck; { compares the global Y with ExpectedResults[C,Col] and then advances Col }
+  begin
+    if Y<>ExpectedResults[C,Col] then
+      begin
+        writeln('Error at ',C,' ',Col); { reports the row and column of the mismatch }
+        halt(1); { non-zero exit code marks the test as failed }
+      end;
+    Inc(Col); { the next call checks the next column of the same row }
+  end;
+
+begin
+  for C := Low(TestValues) to High(TestValues) do
+  begin
+    X := TestValues[C];
+    Col := 1; { start at the first ExpectedResults column for this value }
+
+    Y := X div 1000;
+    Write(Y,','); { results are echoed comma-separated, one output line per test value }
+    DoCheck;
+
+    Y := X mod 1000;
+    Write(Y,',');
+    DoCheck;
+
+    Y := X div 1;
+    Write(Y,',');
+    DoCheck;
+
+    Y := X mod 1;
+    Write(Y,',');
+    DoCheck;
+
+    Y := X div 3;
+    Write(Y,',');
+    DoCheck;
+
+    Y := X mod 3;
+    Write(Y,',');
+    DoCheck;
+
+    Y := X div $1000;
+    Write(Y,',');
+    DoCheck;
+
+    Y := X mod $1000;
+    Write(Y,',');
+    DoCheck;
+
+    Y := X div $7FFFFFFF;
+    Write(Y,',');
+    DoCheck;
+
+    Y := X mod $7FFFFFFF;
+    Write(Y,',');
+    DoCheck;
+
+    Y := X div $80000000;
+    Write(Y,',');
+    DoCheck;
+
+    Y := X mod $80000000;
+    Write(Y,',');
+    DoCheck;
+
+    Y := X div $80000001;
+    Write(Y,',');
+    DoCheck;
+
+    Y := X mod $80000001;
+    Write(Y,',');
+    DoCheck;
+
+    Y := X div $FFFFFFFF;
+    Write(Y,',');
+    DoCheck;
+
+    Y := X mod $FFFFFFFF;
+    Writeln(Y); { last column: terminate the output line }
+    DoCheck;
+  end;
+  writeln('ok'); { only reached when no DoCheck call halted }
+end.
diff --git a/tests/test/cg/tmoddiv4.pp b/tests/test/cg/tmoddiv4.pp
new file mode 100644
index 0000000000..3ad24e46a2
--- /dev/null
+++ b/tests/test/cg/tmoddiv4.pp
@@ -0,0 +1,110 @@
+{ Checks QWord div and mod by a fixed set of constants against a table of precomputed results. }
+const
+  TestValues: array[0..10] of QWord = (500, 1, 0, 995, $100000000, $100000001, $7FFFFFFFFFFFFFFF, QWord($8000000000000000), QWord($8000000000000001), QWord($8000000000000002), 1000000);
+{ ExpectedResults[C,Col]: row C belongs to TestValues[C]; the columns follow the order of the div/mod statements in the main loop. }
+const
+  ExpectedResults: array[0..10,1..18] of QWord = (
+    (0,500,500,0,166,2,0,500,0,500,0,500,0,500,0,500,0,500),
+    (0,1,1,0,0,1,0,1,0,1,0,1,0,1,0,1,0,1),
+    (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0),
+    (0,995,995,0,331,2,0,995,0,995,0,995,0,995,0,995,0,995),
+    (4294967,296,4294967296,0,1431655765,1,1048576,0,2,2,2,0,0,4294967296,0,4294967296,0,4294967296),
+    (4294967,297,4294967297,0,1431655765,2,1048576,1,2,3,2,1,0,4294967297,0,4294967297,0,4294967297),
+    (9223372036854775,807,9223372036854775807,0,3074457345618258602,1,2251799813685247,4095,4294967298,1,4294967295,2147483647,1,0,0,9223372036854775807,18446744073709551615,0), { NOTE(review): the last pair equals the signed result -1,0 rather than the unsigned 0,X - presumably the uncast $8000000000000001 literal is typed Int64; confirm }
+    (9223372036854775,808,9223372036854775808,0,3074457345618258602,2,2251799813685248,0,4294967298,2,4294967296,0,1,1,1,0,1,18446744073709551615),
+    (9223372036854775,809,9223372036854775809,0,3074457345618258603,0,2251799813685248,1,4294967298,3,4294967296,1,1,2,0,9223372036854775809,1,0),
+    (9223372036854775,810,9223372036854775810,0,3074457345618258603,1,2251799813685248,2,4294967298,4,4294967296,2,1,3,0,9223372036854775810,0,9223372036854775810),
+    (1000,0,1000000,0,333333,1,244,576,0,1000000,0,1000000,0,1000000,0,1000000,0,1000000));
+
+var
+  X, Y: QWord;
+  C, Col: LongWord;
+
+procedure DoCheck; { compares the global Y with ExpectedResults[C,Col] and then advances Col }
+  begin
+    if Y<>ExpectedResults[C,Col] then
+      begin
+        writeln('Error at ',C,' ',Col); { reports the row and column of the mismatch }
+        halt(1); { non-zero exit code marks the test as failed }
+      end;
+    Inc(Col); { the next call checks the next column of the same row }
+  end;
+
+begin
+  for C := Low(TestValues) to High(TestValues) do
+  begin
+    X := TestValues[C];
+    Col := 1; { start at the first ExpectedResults column for this value }
+    Y := QWord(X) div 1000;
+    Write(Y,','); { results are echoed comma-separated, one output line per test value }
+    DoCheck;
+
+    Y := QWord(X) mod 1000;
+    Write(Y,',');
+    DoCheck;
+
+    Y := QWord(X) div 1;
+    Write(Y,',');
+    DoCheck;
+
+    Y := QWord(X) mod 1;
+    Write(Y,',');
+    DoCheck;
+
+    Y := QWord(X) div 3;
+    Write(Y,',');
+    DoCheck;
+
+    Y := QWord(X) mod 3;
+    Write(Y,',');
+    DoCheck;
+
+    Y := QWord(X) div $1000;
+    Write(Y,',');
+    DoCheck;
+
+    Y := QWord(X) mod $1000;
+    Write(Y,',');
+    DoCheck;
+
+    Y := QWord(X) div $7FFFFFFF;
+    Write(Y,',');
+    DoCheck;
+
+    Y := QWord(X) mod $7FFFFFFF;
+    Write(Y,',');
+    DoCheck;
+
+    Y := QWord(X) div $80000000;
+    Write(Y,',');
+    DoCheck;
+
+    Y := QWord(X) mod $80000000;
+    Write(Y,',');
+    DoCheck;
+
+    Y := QWord(X) div $7FFFFFFFFFFFFFFF;
+    Write(Y,',');
+    DoCheck;
+
+    Y := QWord(X) mod $7FFFFFFFFFFFFFFF;
+    Write(Y,',');
+    DoCheck;
+
+    Y := QWord(X) div $8000000000000000; { NOTE(review): the divisor literal is not cast to QWord, unlike the same value in TestValues - confirm the intended operand type }
+    Write(Y,',');
+    DoCheck;
+
+    Y := QWord(X) mod $8000000000000000;
+    Write(Y,',');
+    DoCheck;
+
+    Y := QWord(X) div $8000000000000001;
+    Write(Y,',');
+    DoCheck;
+
+    Y := QWord(X) mod $8000000000000001;
+    Writeln(Y); { last column: terminate the output line }
+    DoCheck;
+  end;
+end.