* Improved UTF-8 detection by Alexey

git-svn-id: trunk@47414 -
2025-08-15 19:49:12 +02:00 · 2020-11-13 23:30:23 +00:00 · 2020-11-13 23:30:23 +00:00 · 8bbd92bf82
commit 8bbd92bf82
parent bf7961a901
1 changed file with 18 additions and 12 deletions
--- a/packages/rtl-objpas/src/inc/widestrutils.pp
+++ b/packages/rtl-objpas/src/inc/widestrutils.pp
@@ -43,8 +43,11 @@ function HasExtendCharacter(const S: RawByteString): boolean;
 function DetectUTF8Encoding(const S: RawByteString): TEncodeType;
 function IsUTF8String(const S: RawByteString): boolean;
 type
  TBufferUTF8State = (u8sUnknown, u8sYes, u8sNo);
 //PartialAllowed must be set to true if the buffer is smaller than the file.
-function IsBufferUtf8(buf:PAnsiChar;PartialAllowed:boolean):boolean;
+function IsBufferUTF8(buf: PAnsiChar; bufSize: SizeInt; PartialAllowed: boolean): TBufferUTF8State;
 implementation
@@ -76,34 +79,37 @@ begin
  result:=(byte(thechar) and (128+64))=128;
 end;
-function IsBufferUtf8(buf:PAnsiChar;PartialAllowed:boolean):boolean;
+function IsBufferUTF8(buf: PAnsiChar; bufSize: SizeInt; PartialAllowed: boolean): TBufferUTF8State;
 {Buffer contains only valid UTF-8 characters, no secondary alone,
 no primary without the correct nr of secondary}
-var p:PAnsiChar;
+var
-    utf8bytes:integer;
+  p: PAnsiChar;
-    hadutf8bytes:boolean;
+  i: SizeInt;
  utf8bytes: integer;
  hadutf8bytes: boolean;
 begin
  p:=buf;
  hadutf8bytes:=false;
-  result:=false;
+  result:=u8sUnknown;
  utf8bytes:=0;
-  while p^<>#0 do
+  for i:= 1 to bufSize do
  begin
    if utf8bytes>0 then
    begin  {Expecting secondary AnsiChar}
      hadutf8bytes:=true;
-      if not IsSecondaryUTF8Char(p^) then exit;  {Fail!}
+      if not IsSecondaryUTF8Char(p^) then exit(u8sNo);  {Fail!}
      dec(utf8bytes);
    end
    else
    if IsFirstUTF8Char(p^) then
      utf8bytes:=bytesFromUTF8[p^]
    else
-    //if IsSecondaryUTF8Char(p^) then //Alexey: redundant check
+    if IsSecondaryUTF8Char(p^) then
-      exit;  {Fail!}
+      exit(u8sNo);  {Fail!}
    inc(p);
  end;
-    result:=hadutf8bytes and (PartialAllowed or (utf8bytes=0));
+  if hadutf8bytes and (PartialAllowed or (utf8bytes=0)) then
    result:=u8sYes;
 end;
 function WideReplaceStr(const AText, AFromText, AToText: WideString): WideString; inline;
@@ -195,7 +201,7 @@ begin
  if FirstExtChar=0 then
    Result := etUSASCII
  else
-  if IsBufferUtf8(@S[FirstExtChar], false) then
+  if IsBufferUtf8(@S[FirstExtChar], Length(S)-FirstExtChar+1, false)=u8sYes then
    Result := etUTF8
  else
    Result := etANSI;