Merged revision(s) 51970 #ae57408e17, 51972-51973 #34be9ae2d0-#34be9ae2d0 from trunk:

docs: add fugue icons to acknowledgements ........ LazUtf8: implement Utf8TryFindCodepointStart. ........ LazUtf8: Refactor UTF8FindNearestCharStart. Resolves Issue #0029851. ........ git-svn-id: branches/fixes_1_6@51995 -
2025-04-07 01:57:57 +02:00 · 2016-03-19 13:32:54 +00:00 · 2016-03-19 13:32:54 +00:00 · 7c789e1ef4
commit 7c789e1ef4
parent d1619263b1
2 changed files with 73 additions and 33 deletions
--- a/components/lazutils/lazutf8.pas
+++ b/components/lazutils/lazutf8.pas
@ -78,6 +78,8 @@ function UTF8ToDoubleByteString(const s: string): string;
 function UTF8ToDoubleByte(UTF8Str: PChar; Len: PtrInt; DBStr: PByte): PtrInt;
 function UTF8FindNearestCharStart(UTF8Str: PChar; Len: SizeInt;
                                  BytePos: SizeInt): SizeInt;
+function Utf8TryFindCodepointStart(AString: PChar; var CurPos: PChar; out CharLen: Integer): Boolean;
+function Utf8TryFindCodepointStart(const AString: String; var Index: Integer; out CharLen: Integer): Boolean;
 // find the n-th UTF8 character, ignoring BIDI
 function UTF8CharStart(UTF8Str: PChar; Len, CharIndex: PtrInt): PChar;
 // find the byte index of the n-th UTF8 character, ignoring BIDI (byte len of substr)
@ -579,46 +581,81 @@ begin
  end;
 end;

-{ Find the start of the UTF8 character which contains BytePos,
-  Len is length in byte, BytePos starts at 0 }
-function UTF8FindNearestCharStart(UTF8Str: PChar; Len: SizeInt; BytePos: SizeInt
-  ): SizeInt;
+
+{ Tries to find the start of a valid UTF8 codepoint that contains the character pointed to by CurPos
+  - AString: pointer to the (start of the) string
+  - CurPos: pointer to the character inside AString that we want to get the information of
+    * if the function succeeds, CurPos will point to the start of the valid UTF8 codepoint
+    * if the function fails, CurPos will not be changed
+    Note: if CurPos points beyond the end of AString you will get a crash!
+  - CharLen: the length of the UTF8 codepoint in bytes, if the function succeeds (do not rely on its value on failure)
+  - Returns:
+    True if the character pointed to by Curpos is part of a valid UTF8 codepoint (1 to 4 bytes),
+    otherwise it returns False. }
+function Utf8TryFindCodepointStart(AString: PChar; var CurPos: PChar; out CharLen: Integer): Boolean;
+var
+  SavedPos: PChar;
 begin
-  Result:=0;
-  if (UTF8Str<>nil) and (Len>0) and (BytePos>=0) then begin
-    Result:=BytePos;
-    if Result>Len then Result:=Len-1;
-    if (Result>0) and (ord(UTF8Str[Result]) and %11000000=%10000000) then begin
-      dec(Result);
-      if (Result>0) and (ord(UTF8Str[Result]) and %11000000=%10000000) then begin
-        dec(Result);
-        if (Result>0) and (ord(UTF8Str[Result]) and %11000000=%10000000) then begin
-          dec(Result);
-          // should be four byte character
-          if (ord(UTF8Str[Result]) and %11111000<>%11110000) then begin
-            // broken UTF8 character
-            inc(Result,3);
-          end else begin
-            // is four byte character
-          end;
-        end else if (ord(UTF8Str[Result]) and %11110000<>%11100000) then begin
-          // broken UTF8 character, should be three byte
-          inc(Result,2);
-        end else
-        begin
-          // is three byte character
-        end;
-      end else if (ord(UTF8Str[Result]) and %11100000<>%11000000) then begin
-        // broken UTF8 character, should be two byte
-        inc(Result);
-      end else
+  Result := False;
+  CharLen := 0;
+  if (not (Assigned(AString) and Assigned(CurPos)))
+      or (CurPos < AString) then Exit;
+  SavedPos := CurPos;
+  //Note: UTF8CharacterStrictLength will NOT "look" beyond the terminating #0 of a PChar, so this is safe with AnsiStrings
+  CharLen := UTF8CharacterStrictLength(CurPos);
+  if (CharLen > 0) then Exit(True);
+  if (CurPos > AString) then
+  begin
+    Dec(CurPos);   //-1: one byte before the original position
+    //is it second byte of 2..4 byte codepoint?
+    CharLen := UTF8CharacterStrictLength(CurPos);
+    if (CharLen > 1) then Exit(True);
+    if (CurPos > AString) then
+    begin
+      Dec(CurPos);   //-2: two bytes before the original position
+      //is it third byte of 3..4 byte codepoint?
+      CharLen := UTF8CharacterStrictLength(CurPos);
+      if (CharLen > 2) then Exit(True);
+      if (CurPos > AString) then
      begin
-        // is two byte character
+        Dec(CurPos);   //-3: three bytes before the original position
+       //is it fourth byte of 4 byte codepoint?
+       CharLen := UTF8CharacterStrictLength(CurPos);
+       if (CharLen = 4) then Exit(True);
      end;
    end;
  end;
+  //At this point we failed: we are NOT inside a valid UTF8 codepoint, so restore the caller's position
+  CurPos := SavedPos;
 end;

+{ String-index variant of Utf8TryFindCodepointStart.
+  - AString: the string to inspect
+  - Index: 1-based byte index into AString
+    * if the function succeeds, Index is moved back to the first byte of the codepoint containing it
+    * if the function fails, Index is not changed
+  - CharLen: the length of the UTF8 codepoint in bytes, if the function succeeds
+    (0 when Index is out of range; do not rely on its value for other failures)
+  - Returns:
+    True if the byte at Index is part of a valid UTF8 codepoint, otherwise False.
+    An Index outside 1..Length(AString) (this includes any Index for an empty string)
+    returns False instead of taking the address of a character that does not exist. }
+function Utf8TryFindCodepointStart(const AString: String; var Index: Integer; out CharLen: Integer): Boolean;
+var
+  CurPos, SavedCurPos: PChar;
+begin
+  Result := False;
+  CharLen := 0;
+  //Guard first: @AString[Index] is only valid for 1 <= Index <= Length(AString)
+  if (Index < 1) or (Index > Length(AString)) then Exit;
+  CurPos := @AString[Index];
+  SavedCurPos := CurPos;
+  Result := Utf8TryFindCodepointStart(PChar(AString), CurPos, CharLen);
+  //CurPos either stayed put (failure) or moved back 1..3 bytes (success); apply the same shift to Index
+  Index := Index - (SavedCurPos - CurPos);
+end;
+
+{ Find the start of the UTF8 character which contains BytePos.
+  - UTF8Str: pointer to the start of the string
+  - Len: length of the string in bytes
+  - BytePos: 0-based byte position; values beyond the last byte are clamped to Len-1
+  - Returns:
+    the 0-based byte position of the start of the codepoint that contains BytePos;
+    if BytePos is not part of a valid Utf8Codepoint the function returns (the clamped) BytePos;
+    if UTF8Str is nil, Len <= 0 or BytePos < 0 the function returns 0 }
+function UTF8FindNearestCharStart(UTF8Str: PChar; Len: SizeInt; BytePos: SizeInt): SizeInt;
+var
+  CurPos: PChar;
+  CharLen: Integer;
+begin
+  Result := 0;
+  //Degenerate input: return 0 rather than a negative or meaningless position
+  if (UTF8Str = nil) or (Len <= 0) or (BytePos < 0) then Exit;
+  if (BytePos > Len-1) then BytePos := Len - 1;
+  CurPos := Utf8Str + BytePos;
+  //No need to check the result value, since when it returns False CurPos will be reset
+  //to its original value, and that's what we want to return in that case
+  Utf8TryFindCodepointStart(Utf8Str, CurPos, CharLen);
+  Result := CurPos - Utf8Str;
+end;
+
+
 { Len is the length in bytes of UTF8Str
  CharIndex is the position of the desired char (starting at 0), in chars
 }
--- a/docs/acknowledgements.txt
+++ b/docs/acknowledgements.txt
@ -15,6 +15,9 @@ http://www.famfamfam.com/lab/icons/silk/
 Tango Icon Library 
 http://tango.freedesktop.org/Tango_Icon_Library

+Fugue Icons
+http://p.yusukekamiyamane.com/
+

 #RemObjects Pascal Script 
 Created by Carlo Kok / RemObjects Software