LazReport, fix word wrapping/breaking of UTF-8 strings

git-svn-id: trunk@23057 -
2025-08-10 18:36:25 +02:00 · 2009-12-10 07:42:34 +00:00 · 2009-12-10 07:42:34 +00:00 · b5ca260573
commit b5ca260573
parent 3ccb54ad89
2 changed files with 168 additions and 68 deletions
--- a/components/lazreport/source/lr_class.pas
+++ b/components/lazreport/source/lr_class.pas
@ -2449,7 +2449,7 @@ begin
          CanBreak := True;
      end;
      if CanBreak then
-        Result := Result + Chr(UTF8CharToByteIndex(pchar(s),length(s),i));
+        Result := Result + Chr(i);
      Inc(i);
    until i > Len - 2;
  end;
@ -2463,6 +2463,7 @@ var
  size, size1, maxwidth: Integer;
  b: TWordBreaks;
  WCanvas: TCanvas;
+  desc: string;

  procedure OutLine(const str: String);
  var
@ -2481,9 +2482,11 @@ var

  procedure WrapLine(const s: String);
  var
-    i, cur, beg, last: Integer;
+    i, cur, beg, last, len: Integer;
    WasBreak, CRLF: Boolean;
+    ch: TUTF8char;
  begin
+
    CRLF := False;
    for i := 1 to Length(s) do
    begin
@ -2493,108 +2496,107 @@ var
        break;
      end;
    end;
+
    last := 1; beg := 1;
-    {$IFDEF DebugLR}
-    debugLn('        WrapLine: init "',dbgstr(s),'" wcanvas.txtw=',dbgs(WCanvas.TextWidth(s)),
-      ' maxwidth=',dbgs(maxwidth),' crlf=',dbgs(crlf));
-    {$ENDIF}
    if not CRLF and ((Length(s) <= 1) or (WCanvas.TextWidth(s) <= maxwidth)) then
    begin
-      {$IFDEF DebugLR}
-      debugLn('        WrapLine: fast crlf=',dbgs(crlf),' wcanvas.txtw=',dbgs(Wcanvas.TextWidth(s)));
-      {$ENDIF}
      OutLine(s + #1)
    end else
    begin
+
      cur := 1;
-      while cur <= Length(s) do
+      Len := UTF8Desc(S, Desc);
+
+      while cur <= Len do
      begin
-        if s[cur] in [#10, #13] then
+        Ch := UTF8Char(s, cur, Desc);
+
+        // check for items with soft-breaks
+        if (Ch=#10) or (Ch=#13) then
        begin
-          {$IFDEF DebugLR}
-          debugLn('        Wrapline: crlf beg=',dbgs(beg),' cur=',dbgs(cur));
-          {$ENDIF}
-          OutLine(Copy(s, beg, cur - beg) + #1);
-          while (cur < Length(s)) and (s[cur] in [#10, #13]) do Inc(cur);
+          OutLine(UTF8Range(s, beg, cur - beg, Desc) + #1);
+          while (cur<Len) and UTF8CharIn(ch, [#10,#13]) do
+          begin
+            inc(cur);
+            if cur<=len then
+              ch := UTF8Char(s, cur, desc);
+          end;
+
          beg := cur; last := beg;
-          if s[cur] in [#13, #10] then begin
-            {$IFDEF DebugLR}
-            debugln('        Wrapline: Exiting as cur is in crlf cur=',dbgs(cur));
-            {$ENDIF}
-            Exit;
-          end else
+          if UTF8CharIn(Ch,[#13, #10]) then
+            exit
+          else
            continue;
        end;
-        if s[cur] <> ' ' then
-        if WCanvas.TextWidth(Copy(s, beg, cur - beg + 1)) > maxwidth then
+
+        if ch <> ' ' then
+        if WCanvas.TextWidth(UTF8Range(s, beg, cur - beg + 1, Desc)) > maxwidth then
        begin
+
          WasBreak := False;
          if (Flags and flWordBreak) <> 0 then
          begin
+
+            // in case of breaking in the middle, get the full word
            i := cur;
-            while (i <= Length(s)) and not (s[i] in spaces) do
+            while (i <= Len) and not UTF8CharIn(ch, [' ', '.', ',', '-']) do
+            begin
              Inc(i);
-            {$IFDEF DebugLR}
-            debugln('        wrapline: to BreakWord: ini=',dbgs(last+1),' fin=',dbgs(i-last-1));
-            {$ENDIF}
-            b := BreakWord(Copy(s, last + 1, i - last - 1));
+              if i<=len then
+                ch := UTF8Char(s, i, Desc);
+            end;
+
+            // find word's break points using some simple hyphenator algorithm
+            // TODO: implement interface so users can use their own hyphenator
+            //       algorithm
+            b := BreakWord(UTF8Range(s, last, i - last, Desc));
+
+            // if word can be broken in many segments, find the last segment that
+            // fits within maxwidth
            if Length(b) > 0 then
            begin
              i := 1;
-              cur := last;
              while (i <= Length(b)) and
-                (WCanvas.TextWidth(Copy(s, beg, last - beg + 1 + Ord(b[i])) + '-') <= maxwidth) do
+                (WCanvas.TextWidth(UTF8Range(s, beg, last - beg + Ord(b[i]), Desc) + '-') <= maxwidth) do
              begin
                WasBreak := True;
-                cur := last + Ord(b[i]);
+                cur := last + Ord(b[i]);  // cur now points to next char after breaking word
                Inc(i);
              end;
-              last := cur;
            end;
+
+            // last now points to nex char to be processed
+            last := cur;
          end
          else
-            if last = beg then last := cur;
-          if WasBreak then begin
-            {$IFDEF DebugLR}
-            debugln('        wrapline: wasbreak=true, beg=',dbgs(beg),' fin=',dbgs(last-beg+1));
-            {$ENDIF}
-            OutLine(Copy(s, beg, last - beg + 1) + '-');
-          end else if s[last] = ' ' then begin
-                 {$IFDEF DebugLR}
-                 debugln('        wrapline: s[last]=" ", beg=',dbgs(beg),' fin=',dbgs(last-beg));
-                 {$ENDIF}
-                 OutLine(Copy(s, beg, last - beg))
-               end else begin
-                  {$IFDEF DebugLR}
-                  debugln('        wrapline: s[last]<>" ", beg=',dbgs(beg),' fin=',dbgs(last-beg));
-                  {$ENDIF}
-                  OutLine(Copy(s, beg, last - beg + 1));
-               end;
-          if ((Flags and flWordBreak) <> 0) and not WasBreak and (last = cur) then
          begin
-            beg := cur + 1;
-            cur := Length(s);
-            break;
+            if last = beg then
+              last := cur;
          end;
-          if (Flags and flWordBreak) = 0 then
+
+          if WasBreak then
          begin
-            if last = cur then
-            begin
-              beg := cur;
-              break;
-            end;
+            // if word has been broken, output the partial word plus an hyphen
+            OutLine(UTF8Range(s, beg, last - beg, Desc) + '-');
+          end else
+          begin
+            // output the portion of word that fits maxwidth
+            OutLine(UTF8Range(s, beg, last - beg, Desc));
+            // if space was found, advance to next no space char
+            if s[last] = ' ' then
+              last := last + 1;
          end;
-          beg := last + 1; last := beg;
+
+          beg := last;
        end;
-        if s[cur] in spaces then last := cur;
+
+        if UTF8CharIn(Ch, [' ', '.', ',', '-']) then
+          last := cur;
        Inc(cur);
      end;
-      if beg <> cur then begin
-        {$IFDEF DebugLR}
-        debugln('        wrapline: beg<>cur, beg=',dbgs(beg),' fin=',dbgs(cur-beg+1));
-        {$ENDIF}
-        OutLine(Copy(s, beg, cur - beg + 1) + #1);
-      end;
+
+      if beg <> cur then
+        OutLine(UTF8Range(s, beg, cur - beg + 1, Desc) + #1);
    end;
  end;

--- a/components/lazreport/source/lr_utils.pas
+++ b/components/lazreport/source/lr_utils.pas
@ -53,10 +53,20 @@ function StrTofrTypeObject(St : string) : Byte;

 function lrGetUnBrackedStr(const S:string):string; //remove '[' from begion of string and ']' from end

+// utf8 tools
+function UTF8Desc(S:string; var Desc: string): Integer;
+function UTF8Char(S:string; index:Integer; Desc:string): TUTF8Char;
+function UTF8Range(S:string; index,count:Integer; Desc:String):string;
+function UTF8Index(S:string; index:integer; desc:string): Integer;
+function UTF8CharIn(ch:TUTF8Char; const arrstr:array of string): boolean;
+
 implementation

 uses LR_Class;

+var
+  LocalDescri: string;
+
 procedure frInitFont(aFont : TFont; aColor : TColor; aSize : Integer; aStyle : TFontStyles);
 begin
  with aFont do
@ -530,4 +540,92 @@ begin
  Result:=S;
 end;

+function UTF8Desc(S: string; var Desc: string): Integer;
+var
+    i,b: Integer;
+begin
+  i := 1;
+  Result := 0;
+  SetLength(Desc, Length(S));
+  while i<=Length(s) do begin
+    b := UTF8CharacterStrictLength(@S[i]);
+    inc(i,b);
+    inc(Result);
+    Desc[Result] := Char(b);
+  end;
+  Setlength(Desc, Result);
+end;
+
+function UTF8Char(S: string; index: Integer; Desc: string): TUTF8Char;
+var
+  i,j: Integer;
+begin
+  Result := '';
+  if (index<1) or (index>Length(Desc)) then begin
+    //Result := #$EF#$BF#$BD  // replacement character
+    exit;
+  end;
+
+  i:=0; j:=1;
+  while i<Length(Desc) do begin
+    inc(i);
+    if i=index then begin
+      Move(S[j],Result[1],ord(Desc[i]));
+      Result[0]:=Desc[i];
+      break;
+    end;
+    inc(j, ord(Desc[i]));
+  end;
+
+end;
+
+function UTF8Range(S: string; index, count: Integer; Desc: String
+  ): string;
+var
+  c,i: Integer;
+begin
+  result := '';
+  c := 0;
+  i := index;
+  while (Count>0) and (i<=Length(Desc)) do begin
+    c := c + ord(Desc[i]);
+    inc(i);
+    Dec(Count);
+  end;
+  i := UTF8Index(S, Index, Desc);
+  if i>0 then begin
+    SetLength(Result, c);
+    Move(S[i],Result[1],c);
+  end;
+end;
+
+// this assume index is in valid range
+function UTF8Index(S: string; index: integer; desc: string): Integer;
+var
+  i,c: integer;
+begin
+  result := 0;
+  c := 0;
+  for i:=1 to Length(Desc) do begin
+    inc(c);
+    if i=index then begin
+      result := c;
+      break;
+    end;
+    c := c + ord(Desc[i]) - 1;
+  end;
+end;
+
+function UTF8CharIn(ch:TUTF8Char; const arrstr: array of string): boolean;
+var
+  i: Integer;
+begin
+  result := false;
+  for i:=low(arrstr) to high(arrstr) do
+    if arrstr[i]=ch then begin
+      result := true;
+      break;
+    end;
+end;
+
 end.