LCL: improved UTF8FindNearestCharStart

git-svn-id: trunk@27831 -
2025-06-17 07:48:22 +02:00 · 2010-10-24 17:44:33 +00:00 · 2010-10-24 17:44:33 +00:00 · af460b1644
commit af460b1644
parent b3b03bfaea
1 changed files with 33 additions and 12 deletions
--- a/lcl/lclproc.pas
+++ b/lcl/lclproc.pas
@ -3316,27 +3316,48 @@ begin
  end;
 end;
 { Find the start of the UTF-8 character which contains BytePos.
   UTF8Str: pointer to the UTF-8 encoded bytes. nil gives 0.
   Len:     length of UTF8Str in bytes. Len<=0 gives 0.
   BytePos: byte offset to look up, starting at 0. A negative value gives 0.
            BytePos=Len (the position behind the last byte) is returned
            unchanged, BytePos>Len is treated like Len-1.
   Result:  byte offset (starting at 0) of the first byte of the character
            that covers BytePos. If the bytes in front of BytePos do not form
            a sequence that reaches BytePos (broken UTF-8), the position
            itself is returned.
   Only the bytes at BytePos and at most three bytes in front of it are read. }
 function UTF8FindNearestCharStart(UTF8Str: PChar; Len: integer;
   BytePos: integer): integer;
 var
   Back: integer;    // number of continuation bytes stepped over so far
   Lead: byte;       // candidate start byte found in front of them
   CharLen: integer; // sequence length announced by Lead, 0 if it is no lead
 begin
   Result:=0;
   if (UTF8Str=nil) or (Len<=0) or (BytePos<0) then exit;
   Result:=BytePos;
   if Result>Len then Result:=Len-1;
   // BytePos=Len is the end position: return it as is instead of reading
   // UTF8Str[Len], which lies outside the Len bytes given by the caller
   if Result>=Len then exit;
   // step back over up to three continuation bytes (%10xxxxxx),
   // never in front of byte 0
   Back:=0;
   while (Back<3) and (Result-Back>0)
   and (ord(UTF8Str[Result-Back]) and %11000000=%10000000) do
     inc(Back);
   // already on a start byte, or on byte 0
   if Back=0 then exit;
   Lead:=ord(UTF8Str[Result-Back]);
   if Lead and %11100000=%11000000 then
     CharLen:=2
   else if Lead and %11110000=%11100000 then
     CharLen:=3
   else if Lead and %11111000=%11110000 then
     CharLen:=4
   else
     CharLen:=0; // not the lead byte of a multi byte character
   // The lead byte owns the queried position only if its sequence is long
   // enough to reach it. This also covers positions in the middle of a
   // three or four byte character. Otherwise the sequence is broken and
   // the position is left unchanged.
   if Back<CharLen then
     dec(Result,Back);
 end;
 { Len is the length in bytes of UTF8Str
-  CharIndex is the position of the desired char, in chars
+  CharIndex is the position of the desired char (starting at 0), in chars
  This function is similar to UTF8FindNearestCharStart
 }