From 9788b01d3113a43e9c4730e567f3a711f65ecf67 Mon Sep 17 00:00:00 2001
From: Jonas Maebe <jonas@freepascal.org>
Date: Sat, 14 Mar 2015 18:35:47 +0000
Subject: [PATCH]   * rewrote most of the special case handling of parameter
 passing on ppc64,     documenting the peculiarities of the various calling
 conventions and     marking what we do and don't support currently   * also
 handle arrays for the ELFv2 ABI when determining whether an aggregate    
 only contains floating values of a single type

git-svn-id: trunk@30203 -
---
 compiler/powerpc64/cpupara.pas | 241 +++++++++++++++++++++++++--------
 compiler/powerpc64/symcpu.pas  |  29 ++++
 2 files changed, 215 insertions(+), 55 deletions(-)

diff --git a/compiler/powerpc64/cpupara.pas b/compiler/powerpc64/cpupara.pas
index f4d67dcfe2..65f06490cd 100644
--- a/compiler/powerpc64/cpupara.pas
+++ b/compiler/powerpc64/cpupara.pas
@@ -311,17 +311,22 @@ end;
 
 procedure tppcparamanager.create_paraloc_for_def(var para: TCGPara; varspez: tvarspez; paradef: tdef; var nextfloatreg, nextintreg: tsuperregister; var stack_offset: longint; const isVararg, forceintmem: boolean; const side: tcallercallee; const p: tabstractprocdef);
 var
-  adjusttail: boolean;
-  firstparaloc: boolean;
   paracgsize: tcgsize;
   loc: tcgloc;
   paraloc: pcgparalocation;
+  { def to use for all paralocs if <> nil }
+  alllocdef,
+  { def to use for the current paraloc }
   locdef,
   tmpdef: tdef;
   paralen: aint;
   fsym: tfieldvarsym;
   parashift: byte;
+  tailpadding,
+  firstparaloc,
+  paraaligned: boolean;
 begin
+  alllocdef:=nil;
   locdef:=nil;
   parashift := 0;
   para.reset;
@@ -335,48 +340,134 @@ begin
       paralen := paradef.size
     else
       paralen := tcgsize2size[def_cgsize(paradef)];
-    if (paradef.typ = recorddef) and
-      (varspez in [vs_value, vs_const]) then begin
-      { if a record has only one field and that field is }
-      { non-composite (not array or record), it must be  }
-      { passed according to the rules of that type.       }
-      if tabstractrecordsymtable(tabstractrecorddef(paradef).symtable).has_single_field(fsym) and
-        ((fsym.vardef.typ = floatdef) or
-         (not(target_info.system in systems_aix) and
-          (fsym.vardef.typ in [orddef, enumdef]))) then begin
-        paradef := fsym.vardef;
-        loc := getparaloc(paradef);
-        paracgsize := def_cgsize(paradef)
-      { With the new ABI, so-called "homogeneous" aggregates, i.e. struct, arrays,
-        or unions that (recursively) contain only elements of the same floating-
-        point or vector type are passed as if those elements were passed as
-        separate arguments.  (This is done for up to 8 such elements.) }
-      end else if (target_info.abi=abi_powerpc_elfv2) and
-         tcpurecorddef(paradef).has_single_type_elfv2(tmpdef) and
-         ((8*tmpdef.size)<=paradef.size) then begin
-          locdef := tmpdef;
-          loc := getparaloc(locdef);
-          paracgsize := def_cgsize(locdef);
-      end else begin
-        loc := LOC_REGISTER;
-        paracgsize := int_cgsize(paralen);
-        if (paralen in [3, 5, 6, 7]) then
-          parashift := (8-paralen) * 8;
-      end;
-    end else begin
-      loc := getparaloc(paradef);
-      paracgsize := def_cgsize(paradef);
-      { for things like formaldef }
-      if (paracgsize = OS_NO) then begin
-        paracgsize := OS_ADDR;
-        paralen := tcgsize2size[OS_ADDR];
-      end;
-    end
+    { default rules:
+      * integer parameters sign/zero-extended to 64 bit
+      * floating point register used -> skip equivalent GP register
+      * floating point parameters passed as is (32/64 bit)
+      * floating point parameters to variable arguments -> in int registers
+      * aggregates passed in consecutive integer registers
+      * all *aggregate* data in integer registers exactly mirrors the data
+        in memory -> on big endian it's left aligned (passed in most
+        significant part of the 64 bit word if it's < 64 bit), on little
+        endian it's right aligned (least significant part of the 64 bit
+        word)
+
+      special rules:
+
+implemented
+   |
+   | * AIX/ELFv1/SysV ppc64 ABI (big endian only):
+   x    a) single precision floats are stored in the second word of a 64 bit
+           location when passed on the stack
+   x    b) aggregate with 1 floating point element passed like a floating
+           point parameter of the same size
+   x    c) aggregates smaller than 64 bit are aligned in least significant bits
+           of a single 64bit location (incl. register) (AIX exception: it puts
+           them in the most significant bits)
+
+      * ELFv2 ppc64 ABI:
+   x    a) so-called "homogeneous" aggregates, i.e. struct, arrays, or unions
+           that (recursively) contain only elements of the same floating-
+           point or vector type, are passed as if those elements were passed as
+           separate arguments. This is done for up to 8 such elements.
+   x    b) other than a), it's the same as the AIX ppc64 ABI
+
+      * Darwin ppc64 ABI:
+
+      - as in the general case, aggregates in registers mirror their place in
+        memory, so if e.g. a struct starts with a 32 bit integer, it's
+        placed in the upper 32 bits of the corresponding register. A plain
+        32 bit integer para is however passed in the lower 32 bits, since it
+        is promoted to a 64 bit int first (see below)
+
+   x    a) aggregates with sizes 1, 2 and 4 bytes are padded with 0s on the left
+          (-> aligned in least significant bits of 64 bit word on big endian) to
+          a multiple of *4 bytes* (when passed by memory, don't occupy 8 bytes)
+   x    b) other aggregates are padded with 0s on the right (-> aligned in most
+           significant bits of 64 bit word of integer register) to a multiple of
+           *4 bytes*
+   x    c) all floating point parameters (not in aggregates) are promoted to
+           double (doesn't seem to be correct: 8 bytes are reserved in the
+           stack frame, but the compiler still stores a single in it (in the
+           lower 4 bytes -- like with SysV a) )
+   x    d) all integer parameters (not in aggregates) are promoted to 64 bit
+  (x)   e) aggregates (incl. arrays) of exactly 16 bytes passed in two integer
+           registers
+        f) floats in *structures without unions* are processed per rule c)
+           (similar for vector fields)
+        g) other fields in *structures without unions* are processed
+           recursively according to e) / f) if they are aggregates, and h)
+           otherwise (i.e., without promotion!)
+  (x)   h) everything else (structures with unions and size<>16, arrays with
+           size<>16, ...) is passed "normally" in integer registers
+    }
+    { should the tail be shifted into the most significant bits? }
+    tailpadding:=false;
+    { have we ensured that the next parameter location will be aligned to the
+      next 8 byte boundary? }
+    paraaligned:=false;
+    { ELFv2 a) }
+    if (target_info.abi=abi_powerpc_elfv2) and
+       (((paradef.typ=recorddef) and
+         tcpurecorddef(paradef).has_single_type_elfv2(tmpdef)) or
+        ((paradef.typ=arraydef) and
+         tcpuarraydef(paradef).has_single_type_elfv2(tmpdef))) and
+       (tmpdef.typ=floatdef { or vectordef }) and
+       (paradef.size<=(8*tmpdef.size)) then
+      begin
+        alllocdef:=tmpdef;
+        loc:=getparaloc(alllocdef);
+        paracgsize:=def_cgsize(paradef);
+      end
+    { AIX/ELFv1 b) }
+    else if (target_info.abi in [abi_powerpc_aix,abi_powerpc_sysv]) and
+       (paradef.typ=recorddef) and
+       tabstractrecordsymtable(tabstractrecorddef(paradef).symtable).has_single_field(fsym) and
+       (fsym.vardef.typ=floatdef) then
+      begin
+        paradef:=fsym.vardef;
+        loc:=getparaloc(paradef);
+        paracgsize:=def_cgsize(paradef)
+      end
+    else if (((paradef.typ=arraydef) and not
+         is_special_array(paradef)) or
+        (paradef.typ=recorddef)) then
+      begin
+        { should handle Darwin f/g/h) now, but can't model that yet }
+
+        { general rule: aggregate data is aligned in the most significant bits
+          except for ELFv1 c) and Darwin a) }
+        if (target_info.endian=endian_big) and
+           ((target_info.abi in [abi_powerpc_aix,abi_powerpc_elfv2]) or
+            ((target_info.abi=abi_powerpc_sysv) and
+             (paralen>8)) or
+            ((target_info.abi=abi_powerpc_darwin) and
+             not(paralen in [1,2,4]))) then
+          tailpadding:=true
+        { if we don't add tailpadding on the caller side, the callee will have
+          to shift the value in the register before it can store it to memory }
+        else if (target_info.endian=endian_big) and
+           (paralen in [3,5,6,7]) then
+          parashift:=(8-paralen)*8;
+        { general fallback rule: pass aggregate types in integer registers
+          without special adjustments (incl. Darwin h) }
+        loc:=LOC_REGISTER;
+        paracgsize:=int_cgsize(paralen);
+      end
+    else
+      begin
+        loc:=getparaloc(paradef);
+        paracgsize:=def_cgsize(paradef);
+        { for things like formaldef }
+        if (paracgsize=OS_NO) then
+          begin
+            paracgsize:=OS_ADDR;
+            paralen:=tcgsize2size[OS_ADDR];
+          end;
+      end
   end;
 
-  { patch FPU values into integer registers if we currently have
-   to pass them as vararg parameters
-  }
+  { patch FPU values into integer registers if we are processing varargs }
   if (isVararg) and (paradef.typ = floatdef) then begin
     loc := LOC_REGISTER;
     if paracgsize = OS_F64 then
@@ -385,6 +476,41 @@ begin
       paracgsize := OS_32;
   end;
 
+  { AIX/SysV a), Darwin c) -> skip 4 bytes in the stack frame }
+ if (target_info.endian=endian_big) and
+    (paradef.typ=floatdef) and
+    (tfloatdef(paradef).floattype=s32real) and
+    (nextfloatreg>RS_F13) then
+   begin
+     inc(stack_offset,4);
+     paraaligned:=true;
+   end;
+
+ { Darwin d) }
+  if (target_info.abi=abi_powerpc_darwin) and
+     (paradef.typ in [orddef,enumdef]) and
+     (paralen<8) and
+     { we don't have to sign/zero extend the lower 8/16/32 bit on the callee
+       side since it's done on the caller side; however, if the value is
+       passed via memory, we do have to modify the stack offset since this
+       is big endian and otherwise we'll load/store the wrong bytes }
+     ((side=callerside) or
+      forceintmem or
+      (nextintreg>RS_R10)) then
+     begin
+      if side=callerside then
+        begin
+          paralen:=8;
+          paradef:=s64inttype;
+          paracgsize:=OS_S64;
+        end
+      else
+        begin
+          inc(stack_offset,8-paralen);
+          paraaligned:=true;
+        end;
+    end;
+
   para.alignment := std_param_align;
   para.size := paracgsize;
   para.intsize := paralen;
@@ -395,9 +521,13 @@ begin
       paraloc^.loc := LOC_VOID;
     end else
       internalerror(2005011310);
-  adjusttail:=paralen>8;
-  if not assigned(locdef) then
-    locdef:=paradef;
+  if not assigned(alllocdef) then
+    locdef:=paradef
+  else
+    begin
+      locdef:=alllocdef;
+      paracgsize:=def_cgsize(locdef);
+    end;
   firstparaloc:=true;
   { can become < 0 for e.g. 3-byte records }
   while (paralen > 0) do begin
@@ -411,20 +541,18 @@ begin
       paraloc^.shiftval := parashift;
 
       { make sure we don't lose whether or not the type is signed }
-      if (paracgsize <> OS_NO) and (paradef.typ <> orddef) then
+      if (paracgsize <> OS_NO) and
+         (paradef.typ <> orddef) and
+         not assigned(alllocdef) then
         begin
           paracgsize := int_cgsize(paralen);
           locdef:=get_paraloc_def(paradef, paralen, firstparaloc);
         end;
 
-      { aix requires that record data (including partial data) stored in
-        parameter registers is left-aligned. Other targets only do this if
-        the total size of the parameter was > 8 bytes. }
-      if (target_info.endian=endian_big) and
-         ((((target_info.system in systems_aix) and
-            (paradef.typ = recorddef)) or
-           adjusttail) and
-          (paralen < sizeof(aint))) then
+      { Partial aggregate data may have to be left-aligned. If so, add tail
+        padding }
+      if tailpadding and
+         (paralen < sizeof(aint)) then
         begin
           paraloc^.shiftval := (sizeof(aint)-paralen)*(-8);
           paraloc^.size := OS_INT;
@@ -499,7 +627,10 @@ begin
       paraloc^.reference.offset := stack_offset;
 
       { align temp contents to next register size }
-      inc(stack_offset, align(paralen, 8));
+      if not paraaligned then
+        inc(stack_offset, align(paralen, 8))
+      else
+        inc(stack_offset, paralen);
       paralen := 0;
     end;
     firstparaloc:=false;
diff --git a/compiler/powerpc64/symcpu.pas b/compiler/powerpc64/symcpu.pas
index 12e14ca87f..479026934c 100644
--- a/compiler/powerpc64/symcpu.pas
+++ b/compiler/powerpc64/symcpu.pas
@@ -79,6 +79,8 @@ type
   tcpuclassrefdefclass = class of tcpuclassrefdef;
 
   tcpuarraydef = class(tarraydef)
+    { see tcpurecorddef.has_single_type_elfv2; applied to the innermost element type of the array }
+    function has_single_type_elfv2(out def: tdef): boolean;
   end;
   tcpuarraydefclass = class of tcpuarraydef;
 
@@ -218,6 +220,33 @@ implementation
         result:=true;
     end;
 
+
+  { tcpuarraydef }
+
+  { Looks through (nested) regular arrays at the innermost element type:
+    true if that is a float type (returned in def), or a record for which
+    tcpurecorddef.has_single_type_elfv2 returns true (def is set by it) }
+  function tcpuarraydef.has_single_type_elfv2(out def: tdef): boolean;
+    var
+      checkdef: tdef;
+    begin
+      { out parameter: give it a defined value even when we return false }
+      def:=nil;
+      checkdef:=self;
+      while (checkdef.typ=arraydef) and
+            not is_special_array(checkdef) do
+        checkdef:=tarraydef(checkdef).elementdef;
+      if checkdef.typ=recorddef then
+        result:=tcpurecorddef(checkdef).has_single_type_elfv2(def)
+      else
+        begin
+          result:=(checkdef.typ=floatdef);
+          if result then
+            def:=checkdef;
+        end;
+    end;
+
+
 begin
   { used tdef classes }
   cfiledef:=tcpufiledef;