* Improved UTF8 detection by ALexey

git-svn-id: trunk@47414 -
This commit is contained in:
michael 2020-11-13 23:30:23 +00:00
parent bf7961a901
commit 8bbd92bf82

View File

@ -43,8 +43,11 @@ function HasExtendCharacter(const S: RawByteString): boolean;
function DetectUTF8Encoding(const S: RawByteString): TEncodeType;
function IsUTF8String(const S: RawByteString): boolean;
type
TBufferUTF8State = (u8sUnknown, u8sYes, u8sNo);
//PartialAllowed must be set to true if the buffer is smaller than the file.
function IsBufferUtf8(buf:PAnsiChar;PartialAllowed:boolean):boolean;
function IsBufferUTF8(buf: PAnsiChar; bufSize: SizeInt; PartialAllowed: boolean): TBufferUTF8State;
implementation
@ -76,34 +79,37 @@ begin
result:=(byte(thechar) and (128+64))=128;
end;
function IsBufferUtf8(buf:PAnsiChar;PartialAllowed:boolean):boolean;
function IsBufferUTF8(buf: PAnsiChar; bufSize: SizeInt; PartialAllowed: boolean): TBufferUTF8State;
{Buffer contains only valid UTF-8 characters, no secondary alone,
no primary without the correct nr of secondary}
var p:PAnsiChar;
utf8bytes:integer;
hadutf8bytes:boolean;
var
p: PAnsiChar;
i: SizeInt;
utf8bytes: integer;
hadutf8bytes: boolean;
begin
p:=buf;
hadutf8bytes:=false;
result:=false;
result:=u8sUnknown;
utf8bytes:=0;
while p^<>#0 do
for i:= 1 to bufSize do
begin
if utf8bytes>0 then
begin {Expecting secondary AnsiChar}
hadutf8bytes:=true;
if not IsSecondaryUTF8Char(p^) then exit; {Fail!}
if not IsSecondaryUTF8Char(p^) then exit(u8sNo); {Fail!}
dec(utf8bytes);
end
else
if IsFirstUTF8Char(p^) then
utf8bytes:=bytesFromUTF8[p^]
else
//if IsSecondaryUTF8Char(p^) then //Alexey: redundant check
exit; {Fail!}
if IsSecondaryUTF8Char(p^) then
exit(u8sNo); {Fail!}
inc(p);
end;
result:=hadutf8bytes and (PartialAllowed or (utf8bytes=0));
if hadutf8bytes and (PartialAllowed or (utf8bytes=0)) then
result:=u8sYes;
end;
function WideReplaceStr(const AText, AFromText, AToText: WideString): WideString; inline;
@ -195,7 +201,7 @@ begin
if FirstExtChar=0 then
Result := etUSASCII
else
if IsBufferUtf8(@S[FirstExtChar], false) then
if IsBufferUtf8(@S[FirstExtChar], Length(S)-FirstExtChar+1, false)=u8sYes then
Result := etUTF8
else
Result := etANSI;