fpc/compiler/x86/nx86set.pas

{
    $Id$
    Copyright (c) 1998-2002 by Florian Klaempfl

    Generate x86 assembler for in/case nodes

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

 ****************************************************************************
}
unit nx86set;

{$i fpcdefs.inc}

interface

    uses
       node,nset,pass_1,ncgset;

    type

       tx86innode = class(tinnode)
          procedure pass_2;override;
          function pass_1 : tnode;override;
       end;


implementation

    uses
      globtype,systems,
      verbose,globals,
      symconst,symdef,defutil,
      aasmbase,aasmtai,aasmcpu,
      cgbase,pass_2,tgobj,
      ncon,
      cpubase,
      cga,cgobj,cgutils,ncgutil,
      cgx86;

{*****************************************************************************
                              TX86INNODE
*****************************************************************************}

    function tx86innode.pass_1 : tnode;
      begin
         result:=nil;
         { this is the only difference from the generic version }
         expectloc:=LOC_FLAGS;

         firstpass(right);
         firstpass(left);
         if codegenerror then
           exit;

         left_right_max;
         { a smallset needs maybe an misc. register }
         if (left.nodetype<>ordconstn) and
            not(right.location.loc in [LOC_CREGISTER,LOC_REGISTER]) and
            (right.registersint<1) then
           inc(registersint);
      end;


    procedure tx86innode.pass_2;
       type
         Tsetpart=record
           range : boolean;      {Part is a range.}
           start,stop : byte;    {Start/stop when range; Stop=element when an element.}
         end;
       var
         genjumps,
         use_small,
         ranges     : boolean;
         hreg,hreg2,
         pleftreg   : tregister;
         opsize     : tcgsize;
         setparts   : array[1..8] of Tsetpart;
         i,numparts : byte;
         adjustment : longint;
         l,l2       : tasmlabel;
{$ifdef CORRECT_SET_IN_FPC}
         AM         : tasmop;
{$endif CORRECT_SET_IN_FPC}

         function analizeset(Aset:pconstset;is_small:boolean):boolean;
           var
             compares,maxcompares:word;
             i:byte;
           begin
             if tnormalset(Aset^)=[] then
                {The expression...
                    if expr in []
                 ...is allways false. It should be optimized away in the
                 resulttype pass, and thus never occur here. Since we
                 do generate wrong code for it, do internalerror.}
                internalerror(2002072301);
             analizeset:=false;
             ranges:=false;
             numparts:=0;
             compares:=0;
             { Lots of comparisions take a lot of time, so do not allow
               too much comparisions. 8 comparisions are, however, still
               smalller than emitting the set }
             if cs_littlesize in aktglobalswitches then
               maxcompares:=8
             else
               maxcompares:=5;
             { when smallset is possible allow only 3 compares the smallset
               code is for littlesize also smaller when more compares are used }
             if is_small then
               maxcompares:=3;
             for i:=0 to 255 do
              if i in tnormalset(Aset^) then
               begin
                 if (numparts=0) or (i<>setparts[numparts].stop+1) then
                  begin
                  {Set element is a separate element.}
                    inc(compares);
                    if compares>maxcompares then
                         exit;
                    inc(numparts);
                    setparts[numparts].range:=false;
                    setparts[numparts].stop:=i;
                  end
                 else
                  {Set element is part of a range.}
                  if not setparts[numparts].range then
                   begin
                     {Transform an element into a range.}
                     setparts[numparts].range:=true;
                     setparts[numparts].start:=setparts[numparts].stop;
                     setparts[numparts].stop:=i;
                     ranges := true;
                     { there's only one compare per range anymore. Only a }
                     { sub is added, but that's much faster than a        }
                     { cmp/jcc combo so neglect its effect                }
{                     inc(compares);
                     if compares>maxcompares then
                      exit; }
                   end
                  else
                   begin
                    {Extend a range.}
                    setparts[numparts].stop:=i;
                   end;
              end;
             analizeset:=true;
           end;

       begin
         { We check first if we can generate jumps, this can be done
           because the resulttype.def is already set in firstpass }

         { check if we can use smallset operation using btl which is limited
           to 32 bits, the left side may also not contain higher values !! }
         use_small:=(tsetdef(right.resulttype.def).settype=smallset) and
                    ((left.resulttype.def.deftype=orddef) and (torddef(left.resulttype.def).high<=32) or
                     (left.resulttype.def.deftype=enumdef) and (tenumdef(left.resulttype.def).max<=32));

         { Can we generate jumps? Possible for all types of sets }
         genjumps:=(right.nodetype=setconstn) and
                   analizeset(tsetconstnode(right).value_set,use_small);
         { calculate both operators }
         { the complex one first }
         firstcomplex(self);
         secondpass(left);
         { Only process the right if we are not generating jumps }
         if not genjumps then
          begin
            secondpass(right);
          end;
         if codegenerror then
          exit;

         { ofcourse not commutative }
         if nf_swaped in flags then
          swapleftright;

         if genjumps then
          begin
            { It gives us advantage to check for the set elements
              separately instead of using the SET_IN_BYTE procedure.
              To do: Build in support for LOC_JUMP }

            opsize := def_cgsize(left.resulttype.def);
            { If register is used, use only lower 8 bits }
            if left.location.loc in [LOC_REGISTER,LOC_CREGISTER] then
             begin
               { for ranges we always need a 32bit register, because then we }
               { use the register as base in a reference (JM)                }
               if ranges then
                 begin
                   pleftreg:=cg.makeregsize(exprasmlist,left.location.register,OS_INT);
                   cg.a_load_reg_reg(exprasmlist,left.location.size,OS_INT,left.location.register,pleftreg);
                   if opsize<>OS_INT then
                     cg.a_op_const_reg(exprasmlist,OP_AND,OS_INT,255,pleftreg);
                   opsize:=OS_INT;
                 end
               else
                 { otherwise simply use the lower 8 bits (no "and" }
                 { necessary this way) (JM)                        }
                 begin
                   pleftreg:=cg.makeregsize(exprasmlist,left.location.register,OS_8);
                   opsize := OS_8;
                 end;
             end
            else
             begin
               { load the value in a register }
               pleftreg:=cg.getintregister(exprasmlist,OS_32);
               opsize:=OS_32;
               cg.a_load_ref_reg(exprasmlist,OS_8,OS_32,left.location.reference,pleftreg);
             end;

            { Get a label to jump to the end }
            location_reset(location,LOC_FLAGS,OS_NO);

            { It's better to use the zero flag when there are
              no ranges }
            if ranges then
              location.resflags:=F_C
            else
              location.resflags:=F_E;

            objectlibrary.getlabel(l);

            { how much have we already substracted from the x in the }
            { "x in [y..z]" expression                               }
            adjustment := 0;

            for i:=1 to numparts do
             if setparts[i].range then
              { use fact that a <= x <= b <=> cardinal(x-a) <= cardinal(b-a) }
              begin
                { is the range different from all legal values? }
                if (setparts[i].stop-setparts[i].start <> 255) then
                  begin
                    { yes, is the lower bound <> 0? }
                    if (setparts[i].start <> 0) then
                      begin
                        if (left.location.loc = LOC_CREGISTER) then
                          begin
                            hreg:=cg.getintregister(exprasmlist,OS_INT);
                            cg.a_load_reg_reg(exprasmlist,opsize,OS_INT,pleftreg,hreg);
                            pleftreg:=hreg;
                            opsize:=OS_INT;
                          end;
                        cg.a_op_const_reg(exprasmlist,OP_SUB,opsize,setparts[i].start-adjustment,pleftreg);
                      end;

                    { new total value substracted from x:           }
                    { adjustment + (setparts[i].start - adjustment) }
                    adjustment := setparts[i].start;

                    { check if result < b-a+1 (not "result <= b-a", since }
                    { we need a carry in case the element is in the range }
                    { (this will never overflow since we check at the     }
                    { beginning whether stop-start <> 255)                }
                    cg.a_cmp_const_reg_label(exprasmlist,opsize,OC_B,setparts[i].stop-setparts[i].start+1,pleftreg,l);
                  end
                else
                  { if setparts[i].start = 0 and setparts[i].stop = 255,  }
                  { it's always true since "in" is only allowed for bytes }
                  begin
                    exprasmlist.concat(taicpu.op_none(A_STC,S_NO));
                    cg.a_jmp_always(exprasmlist,l);
                  end;
              end
             else
              begin
                { Emit code to check if left is an element }
                exprasmlist.concat(taicpu.op_const_reg(A_CMP,TCGSize2OpSize[opsize],setparts[i].stop-adjustment,
                  pleftreg));
                { Result should be in carry flag when ranges are used }
                if ranges then
                  exprasmlist.concat(taicpu.op_none(A_STC,S_NO));
                { If found, jump to end }
                cg.a_jmp_flags(exprasmlist,F_E,l);
              end;
             if ranges and
                { if the last one was a range, the carry flag is already }
                { set appropriately                                      }
                not(setparts[numparts].range) then
               exprasmlist.concat(taicpu.op_none(A_CLC,S_NO));
             { To compensate for not doing a second pass }
             right.location.reference.symbol:=nil;
             { Now place the end label }
             cg.a_label(exprasmlist,l);
          end
         else
          begin
            location_reset(location,LOC_FLAGS,OS_NO);

            { We will now generated code to check the set itself, no jmps,
              handle smallsets separate, because it allows faster checks }
            if use_small then
             begin
               if left.nodetype=ordconstn then
                begin
                  location.resflags:=F_NE;
                  case right.location.loc of
                    LOC_REGISTER,
                    LOC_CREGISTER:
                      begin
                         emit_const_reg(A_TEST,S_L,
                           1 shl (tordconstnode(left).value and 31),right.location.register);
                      end;
                    LOC_REFERENCE,
                    LOC_CREFERENCE :
                      begin
                        emit_const_ref(A_TEST,S_L,1 shl (tordconstnode(left).value and 31),
                           right.location.reference);
                      end;
                    else
                      internalerror(200203312);
                  end;
                end
               else
                begin
                  case left.location.loc of
                     LOC_REGISTER,
                     LOC_CREGISTER:
                       begin
                          hreg:=cg.makeregsize(exprasmlist,left.location.register,OS_32);
                          cg.a_load_reg_reg(exprasmlist,left.location.size,OS_32,left.location.register,hreg);
                       end;
                  else
                    begin
                      { the set element isn't never samller than a byte
                        and because it's a small set we need only 5 bits
                        but 8 bits are easier to load                    }
                      hreg:=cg.getintregister(exprasmlist,OS_32);
                      cg.a_load_ref_reg(exprasmlist,OS_8,OS_32,left.location.reference,hreg);
                    end;
                  end;

                  case right.location.loc of
                    LOC_REGISTER,
                    LOC_CREGISTER :
                      begin
                        emit_reg_reg(A_BT,S_L,hreg,right.location.register);
                      end;
                     LOC_CONSTANT :
                       begin
                         { We have to load the value into a register because
                            btl does not accept values only refs or regs (PFV) }
                         hreg2:=cg.getintregister(exprasmlist,OS_32);
                         cg.a_load_const_reg(exprasmlist,OS_32,right.location.value,hreg2);
                         emit_reg_reg(A_BT,S_L,hreg,hreg2);
                       end;
                     LOC_CREFERENCE,
                     LOC_REFERENCE :
                       begin
                         emit_reg_ref(A_BT,S_L,hreg,right.location.reference);
                       end;
                     else
                       internalerror(2002032210);
                  end;
                  location.resflags:=F_C;
                end;
             end
            else
             begin
               if right.location.loc=LOC_CONSTANT then
                begin
                  location.resflags:=F_C;
                  objectlibrary.getlabel(l);
                  objectlibrary.getlabel(l2);

                  { load constants to a register }
                  if left.nodetype=ordconstn then
                    location_force_reg(exprasmlist,left.location,OS_INT,true);

                  case left.location.loc of
                     LOC_REGISTER,
                     LOC_CREGISTER:
                       begin
                          hreg:=cg.makeregsize(exprasmlist,left.location.register,OS_32);
                          cg.a_load_reg_reg(exprasmlist,left.location.size,OS_32,left.location.register,hreg);
                          cg.a_cmp_const_reg_label(exprasmlist,OS_32,OC_BE,31,hreg,l);
                          { reset carry flag }
                          exprasmlist.concat(taicpu.op_none(A_CLC,S_NO));
                          cg.a_jmp_always(exprasmlist,l2);
                          cg.a_label(exprasmlist,l);
                          { We have to load the value into a register because
                            btl does not accept values only refs or regs (PFV) }
                          hreg2:=cg.getintregister(exprasmlist,OS_32);
                          cg.a_load_const_reg(exprasmlist,OS_32,right.location.value,hreg2);
                          emit_reg_reg(A_BT,S_L,hreg,hreg2);
                       end;
                  else
                    begin
{$ifdef CORRECT_SET_IN_FPC}
                          if m_tp in aktmodeswitches then
                            begin
                              {***WARNING only correct if
                                reference is 32 bits (PM) *****}
                               emit_const_ref(A_CMP,S_L,31,reference_copy(left.location.reference));
                            end
                          else
{$endif CORRECT_SET_IN_FPC}
                            begin
                               emit_const_ref(A_CMP,S_B,31,left.location.reference);
                            end;
                       cg.a_jmp_flags(exprasmlist,F_BE,l);
                       { reset carry flag }
                       exprasmlist.concat(taicpu.op_none(A_CLC,S_NO));
                       cg.a_jmp_always(exprasmlist,l2);
                       cg.a_label(exprasmlist,l);
                       hreg:=cg.getintregister(exprasmlist,OS_32);
                       cg.a_load_ref_reg(exprasmlist,OS_32,OS_32,left.location.reference,hreg);
                       { We have to load the value into a register because
                         btl does not accept values only refs or regs (PFV) }
                       hreg2:=cg.getintregister(exprasmlist,OS_32);
                       cg.a_load_const_reg(exprasmlist,OS_32,right.location.value,hreg2);
                       emit_reg_reg(A_BT,S_L,hreg,hreg2);
                    end;
                  end;
                  cg.a_label(exprasmlist,l2);
                end { of right.location.loc=LOC_CONSTANT }
               { do search in a normal set which could have >32 elementsm
                 but also used if the left side contains higher values > 32 }
               else if left.nodetype=ordconstn then
                begin
                  location.resflags:=F_NE;
                  inc(right.location.reference.offset,tordconstnode(left).value shr 3);
                  emit_const_ref(A_TEST,S_B,1 shl (tordconstnode(left).value and 7),right.location.reference);
                end
               else
                begin
                  if (left.location.loc=LOC_REGISTER) then
                    pleftreg:=cg.makeregsize(exprasmlist,left.location.register,OS_32)
                  else
                    pleftreg:=cg.getintregister(exprasmlist,OS_32);
                  cg.a_load_loc_reg(exprasmlist,OS_32,left.location,pleftreg);
                  location_freetemp(exprasmlist,left.location);
                  emit_reg_ref(A_BT,S_L,pleftreg,right.location.reference);
                  { tg.ungetiftemp(exprasmlist,right.location.reference) happens below }
                  location.resflags:=F_C;
                end;
             end;
          end;
          if not genjumps then
            location_freetemp(exprasmlist,right.location);
       end;

begin
   cinnode:=tx86innode;
end.
{
  $Log$
  Revision 1.8  2004-10-31 21:45:04  peter
    * generic tlocation
    * move tlocation to cgutils

  Revision 1.7  2004/10/24 20:10:08  peter
    * -Or fixes

  Revision 1.6  2004/10/01 17:32:16  peter
    * fix resizing of LOC_CREGISTER

  Revision 1.5  2004/09/25 14:23:55  peter
    * ungetregister is now only used for cpuregisters, renamed to
      ungetcpuregister
    * renamed (get|unget)explicitregister(s) to ..cpuregister
    * removed location-release/reference_release

  Revision 1.4  2004/06/16 20:07:11  florian
    * dwarf branch merged

  Revision 1.3  2004/05/22 23:34:28  peter
  tai_regalloc.allocation changed to ratype to notify rgobj of register size changes

  Revision 1.2.2.1  2004/04/28 18:35:42  peter
    * cardinal fixes for x86-64

  Revision 1.2  2004/02/27 10:21:06  florian
    * top_symbol killed
    + refaddr to treference added
    + refsymbol to treference added
    * top_local stuff moved to an extra record to save memory
    + aint introduced
    * tppufile.get/putint64/aint implemented

  Revision 1.1  2004/02/22 12:04:04  florian
    + nx86set added
    * some more x86-64 fixes
 }