* Improved UTF8 detection by Alexey

git-svn-id: trunk@47414 -
This commit is contained in:
michael 2020-11-13 23:30:23 +00:00
parent bf7961a901
commit 8bbd92bf82

View File

@ -43,8 +43,11 @@ function HasExtendCharacter(const S: RawByteString): boolean;
function DetectUTF8Encoding(const S: RawByteString): TEncodeType; function DetectUTF8Encoding(const S: RawByteString): TEncodeType;
function IsUTF8String(const S: RawByteString): boolean; function IsUTF8String(const S: RawByteString): boolean;
type
TBufferUTF8State = (u8sUnknown, u8sYes, u8sNo);
//PartialAllowed must be set to true if the buffer is smaller than the file. //PartialAllowed must be set to true if the buffer is smaller than the file.
function IsBufferUtf8(buf:PAnsiChar;PartialAllowed:boolean):boolean; function IsBufferUTF8(buf: PAnsiChar; bufSize: SizeInt; PartialAllowed: boolean): TBufferUTF8State;
implementation implementation
@ -76,34 +79,37 @@ begin
result:=(byte(thechar) and (128+64))=128; result:=(byte(thechar) and (128+64))=128;
end; end;
function IsBufferUtf8(buf:PAnsiChar;PartialAllowed:boolean):boolean; function IsBufferUTF8(buf: PAnsiChar; bufSize: SizeInt; PartialAllowed: boolean): TBufferUTF8State;
{Buffer contains only valid UTF-8 characters, no secondary alone, {Buffer contains only valid UTF-8 characters, no secondary alone,
no primary without the correct nr of secondary} no primary without the correct nr of secondary}
var p:PAnsiChar; var
utf8bytes:integer; p: PAnsiChar;
hadutf8bytes:boolean; i: SizeInt;
utf8bytes: integer;
hadutf8bytes: boolean;
begin begin
p:=buf; p:=buf;
hadutf8bytes:=false; hadutf8bytes:=false;
result:=false; result:=u8sUnknown;
utf8bytes:=0; utf8bytes:=0;
while p^<>#0 do for i:= 1 to bufSize do
begin begin
if utf8bytes>0 then if utf8bytes>0 then
begin {Expecting secondary AnsiChar} begin {Expecting secondary AnsiChar}
hadutf8bytes:=true; hadutf8bytes:=true;
if not IsSecondaryUTF8Char(p^) then exit; {Fail!} if not IsSecondaryUTF8Char(p^) then exit(u8sNo); {Fail!}
dec(utf8bytes); dec(utf8bytes);
end end
else else
if IsFirstUTF8Char(p^) then if IsFirstUTF8Char(p^) then
utf8bytes:=bytesFromUTF8[p^] utf8bytes:=bytesFromUTF8[p^]
else else
//if IsSecondaryUTF8Char(p^) then //Alexey: redundant check if IsSecondaryUTF8Char(p^) then
exit; {Fail!} exit(u8sNo); {Fail!}
inc(p); inc(p);
end; end;
result:=hadutf8bytes and (PartialAllowed or (utf8bytes=0)); if hadutf8bytes and (PartialAllowed or (utf8bytes=0)) then
result:=u8sYes;
end; end;
function WideReplaceStr(const AText, AFromText, AToText: WideString): WideString; inline; function WideReplaceStr(const AText, AFromText, AToText: WideString): WideString; inline;
@ -195,7 +201,7 @@ begin
if FirstExtChar=0 then if FirstExtChar=0 then
Result := etUSASCII Result := etUSASCII
else else
if IsBufferUtf8(@S[FirstExtChar], false) then if IsBufferUtf8(@S[FirstExtChar], Length(S)-FirstExtChar+1, false)=u8sYes then
Result := etUTF8 Result := etUTF8
else else
Result := etANSI; Result := etANSI;