codetools: SplitStringConstant: added UTF8 support

git-svn-id: trunk@30442 -
2025-08-09 22:15:55 +02:00 · 2011-04-23 22:50:43 +00:00 · 2011-04-23 22:50:43 +00:00 · 2dd6f59bd1
commit 2dd6f59bd1
parent f529fa5679
2 changed files with 84 additions and 17 deletions
--- a/components/codetools/basiccodetools.pas
+++ b/components/codetools/basiccodetools.pas
@ -179,7 +179,7 @@ function StringToPascalConst(const s: string): string;
 // string constants
 function SplitStringConstant(const StringConstant: string;
    FirstLineLength, OtherLineLengths, Indent: integer;
-    const NewLine: string): string;
+    const aLineBreak: string): string;
 procedure ImproveStringConstantStart(const ACode: string; var StartPos: integer);
 procedure ImproveStringConstantEnd(const ACode: string; var EndPos: integer);

@ -4309,7 +4309,7 @@ end;

 function SplitStringConstant(const StringConstant: string;
  FirstLineLength, OtherLineLengths, Indent: integer;
-  const NewLine: string): string;
+  const aLineBreak: string): string;
 { Split long string constants
  If possible it tries to split on word boundaries.

@ -4332,9 +4332,10 @@ const
  stctStart      = 'S'; // ' start char
  stctEnd        = 'E'; // ' end char
  stctWordStart  = 'W'; // word char after non word char
-  stctQuotation1 = 'Q'; // first ' of a double ''
-  stctQuotation2 = 'M'; // second ' of a double ''
+  stctQuotation1 = '1'; // first ' of a double ''
+  stctQuotation2 = '2'; // second ' of a double ''
  stctChar       = 'C'; // normal character
+  stctMBC        = 'M'; // follow character of multi byte char
  stctHash       = '#'; // hash
  stctHashNumber = '0'; // hash number
  stctLineEnd10  = #10; // hash number is 10
@ -4348,17 +4349,34 @@ var
  ParsedSrc: string;
  ParsedLen: integer;
  SplitPos: integer;
+  i: Integer;

  procedure ParseSrc;
  var
    APos: Integer;
+
+    procedure MarkMBC;
+    var
+      l: LongInt;
+    begin
+      l:=UTF8CharacterLength(@Src[APos]);
+      inc(APos);
+      dec(l);
+      while (l>0) and (APos<ParsedLen) do begin
+        ParsedSrc[APos]:=stctMBC;
+        inc(APos);
+        dec(l);
+      end;
+    end;
+
+  var
    NumberStart: Integer;
    Number: Integer;
  begin
-    SetLength(ParsedSrc,CurLineMax+1);
    APos:=1;
    ParsedLen:=CurLineMax+1;
    if ParsedLen>SrcLen then ParsedLen:=SrcLen;
+    SetLength(ParsedSrc,CurLineMax+1);
    while APos<=ParsedLen do begin
      if Src[APos]='''' then begin
        ParsedSrc[APos]:=stctStart;
@ -4376,15 +4394,16 @@ var
              ParsedSrc[APos-1]:=stctEnd;
              break;
            end;
-          end else begin
-            // normal char
-            if (Src[APos] in ['A'..'Z','a'..'z'])
-            and (APos>1)
-            and (ParsedSrc[APos-1]=stctChar)
-            and (not (Src[APos-1] in ['A'..'Z','a'..'z'])) then
-              ParsedSrc[APos]:=stctWordStart
+          end else if Src[APos] in ['A'..'Z','a'..'z',#128..#255] then begin
+            // normal word char
+            if (APos>1) and (Src[APos-1] in ['A'..'Z','a'..'z',#128..#255]) then
+              ParsedSrc[APos]:=stctChar
            else
-              ParsedSrc[APos]:=stctChar;
+              ParsedSrc[APos]:=stctWordStart;
+            MarkMBC;
+          end else begin
+            // other char in string constant
+            ParsedSrc[APos]:=stctWordStart;
            inc(APos);
          end;
        end;
@ -4418,7 +4437,7 @@ var
      end else begin
        // junk
        ParsedSrc[APos]:=stctJunk;
-        inc(APos);
+        MarkMBC;
      end;
    end;
  end;
@ -4444,7 +4463,7 @@ var
    NewSplitPos: Integer;
  begin
    if SplitPos>0 then exit;
-    // check if there is a newline character constant
+    // check if there is a aLineBreak character constant
    HashPos:=SearchCharLeftToRight(stctLineEnd10)-1;
    if (HashPos<1) then begin
      HashPos:=SearchCharLeftToRight(stctLineEnd13)-1;
@ -4528,7 +4547,7 @@ var
      CurIndent:=CurLineMax-10;
    if CurIndent<0 then CurIndent:=0;
    // add indent spaces to Result
-    Result:=Result+NewLine+GetIndentStr(CurIndent)+'+';
+    Result:=Result+aLineBreak+GetIndentStr(CurIndent)+'+';
    // calculate next maximum line length
    CurLineMax:=CurLineMax-CurIndent-1;
  end;
@ -4542,8 +4561,9 @@ begin
  CurLineMax:=FirstLineLength;
  //DebugLn('SplitStringConstant FirstLineLength=',FirstLineLength,
  //' OtherLineLengths=',OtherLineLengths,' Indent=',Indent,' ');
+  i:=0;
  repeat
-    //DebugLn('SrcLen=',SrcLen,' CurMaxLine=',CurLineMax);
+    //DebugLn(['SrcLen=',SrcLen,' CurMaxLine=',CurLineMax]);
    //DebugLn('Src="',Src,'"');
    //DebugLn('Result="',Result,'"');
    if SrcLen<=CurLineMax then begin
@ -4553,12 +4573,21 @@ begin
    end;
    // split line -> search nice split position
    ParseSrc;
+    //debugln(['ParsedSrc=',ParsedSrc]);
    SplitPos:=0;
    SplitAtNewLineCharConstant;
    SplitBetweenConstants;
    SplitAtWordBoundary;
    SplitDefault;
+    if SplitPos<=1 then begin
+      // no split possible
+      Result:=Result+Src;
+      break;
+    end;
+    //debugln(['SplitStringConstant SplitPos=',SplitPos]);
    Split;
+    inc(i);
+    if i>10 then break;
  until false;
  //DebugLn('END Result="',Result,'"');
  //DebugLn('SplitStringConstant END---------------------------------');
--- a/components/codetools/fileprocs.pas
+++ b/components/codetools/fileprocs.pas
@ -278,6 +278,7 @@ function NeedRTLAnsi: boolean;// true if system encoding is not UTF-8
 procedure SetNeedRTLAnsi(NewValue: boolean);
 function UTF8ToSys(const s: string): string;// as UTF8ToAnsi but more independent of widestringmanager
 function SysToUTF8(const s: string): string;// as AnsiToUTF8 but more independent of widestringmanager
+function UTF8CharacterLength(p: PChar): integer;

 // file operations
 function FileExistsUTF8(const Filename: string): boolean;
@ -531,6 +532,43 @@ begin
    Result:=s;
 end;

+function UTF8CharacterLength(p: PChar): integer;
+begin
+  if p<>nil then begin
+    if ord(p^)<%11000000 then begin
+      // regular single byte character (#0 is a character, this is pascal ;)
+      Result:=1;
+    end
+    else if ((ord(p^) and %11100000) = %11000000) then begin
+      // could be 2 byte character
+      if (ord(p[1]) and %11000000) = %10000000 then
+        Result:=2
+      else
+        Result:=1;
+    end
+    else if ((ord(p^) and %11110000) = %11100000) then begin
+      // could be 3 byte character
+      if ((ord(p[1]) and %11000000) = %10000000)
+      and ((ord(p[2]) and %11000000) = %10000000) then
+        Result:=3
+      else
+        Result:=1;
+    end
+    else if ((ord(p^) and %11111000) = %11110000) then begin
+      // could be 4 byte character
+      if ((ord(p[1]) and %11000000) = %10000000)
+      and ((ord(p[2]) and %11000000) = %10000000)
+      and ((ord(p[3]) and %11000000) = %10000000) then
+        Result:=4
+      else
+        Result:=1;
+    end
+    else
+      Result:=1
+  end else
+    Result:=0;
+end;
+
 function FileExistsUTF8(const Filename: string): boolean;
 begin
  Result:=SysUtils.FileExists(UTF8ToSys(Filename));