mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-08-15 19:49:12 +02:00
* Improved UTF8 detection by Alexey
git-svn-id: trunk@47414 -
This commit is contained in:
parent
bf7961a901
commit
8bbd92bf82
@ -43,8 +43,11 @@ function HasExtendCharacter(const S: RawByteString): boolean;
|
|||||||
function DetectUTF8Encoding(const S: RawByteString): TEncodeType;
|
function DetectUTF8Encoding(const S: RawByteString): TEncodeType;
|
||||||
function IsUTF8String(const S: RawByteString): boolean;
|
function IsUTF8String(const S: RawByteString): boolean;
|
||||||
|
|
||||||
|
type
|
||||||
|
TBufferUTF8State = (u8sUnknown, u8sYes, u8sNo);
|
||||||
|
|
||||||
//PartialAllowed must be set to true if the buffer is smaller than the file.
|
//PartialAllowed must be set to true if the buffer is smaller than the file.
|
||||||
function IsBufferUtf8(buf:PAnsiChar;PartialAllowed:boolean):boolean;
|
function IsBufferUTF8(buf: PAnsiChar; bufSize: SizeInt; PartialAllowed: boolean): TBufferUTF8State;
|
||||||
|
|
||||||
implementation
|
implementation
|
||||||
|
|
||||||
@ -76,34 +79,37 @@ begin
|
|||||||
result:=(byte(thechar) and (128+64))=128;
|
result:=(byte(thechar) and (128+64))=128;
|
||||||
end;
|
end;
|
||||||
|
|
||||||
function IsBufferUtf8(buf:PAnsiChar;PartialAllowed:boolean):boolean;
|
function IsBufferUTF8(buf: PAnsiChar; bufSize: SizeInt; PartialAllowed: boolean): TBufferUTF8State;
|
||||||
{Buffer contains only valid UTF-8 characters, no secondary alone,
|
{Buffer contains only valid UTF-8 characters, no secondary alone,
|
||||||
no primary without the correct nr of secondary}
|
no primary without the correct nr of secondary}
|
||||||
var p:PAnsiChar;
|
var
|
||||||
utf8bytes:integer;
|
p: PAnsiChar;
|
||||||
hadutf8bytes:boolean;
|
i: SizeInt;
|
||||||
|
utf8bytes: integer;
|
||||||
|
hadutf8bytes: boolean;
|
||||||
begin
|
begin
|
||||||
p:=buf;
|
p:=buf;
|
||||||
hadutf8bytes:=false;
|
hadutf8bytes:=false;
|
||||||
result:=false;
|
result:=u8sUnknown;
|
||||||
utf8bytes:=0;
|
utf8bytes:=0;
|
||||||
while p^<>#0 do
|
for i:= 1 to bufSize do
|
||||||
begin
|
begin
|
||||||
if utf8bytes>0 then
|
if utf8bytes>0 then
|
||||||
begin {Expecting secondary AnsiChar}
|
begin {Expecting secondary AnsiChar}
|
||||||
hadutf8bytes:=true;
|
hadutf8bytes:=true;
|
||||||
if not IsSecondaryUTF8Char(p^) then exit; {Fail!}
|
if not IsSecondaryUTF8Char(p^) then exit(u8sNo); {Fail!}
|
||||||
dec(utf8bytes);
|
dec(utf8bytes);
|
||||||
end
|
end
|
||||||
else
|
else
|
||||||
if IsFirstUTF8Char(p^) then
|
if IsFirstUTF8Char(p^) then
|
||||||
utf8bytes:=bytesFromUTF8[p^]
|
utf8bytes:=bytesFromUTF8[p^]
|
||||||
else
|
else
|
||||||
//if IsSecondaryUTF8Char(p^) then //Alexey: redundant check
|
if IsSecondaryUTF8Char(p^) then
|
||||||
exit; {Fail!}
|
exit(u8sNo); {Fail!}
|
||||||
inc(p);
|
inc(p);
|
||||||
end;
|
end;
|
||||||
result:=hadutf8bytes and (PartialAllowed or (utf8bytes=0));
|
if hadutf8bytes and (PartialAllowed or (utf8bytes=0)) then
|
||||||
|
result:=u8sYes;
|
||||||
end;
|
end;
|
||||||
|
|
||||||
function WideReplaceStr(const AText, AFromText, AToText: WideString): WideString; inline;
|
function WideReplaceStr(const AText, AFromText, AToText: WideString): WideString; inline;
|
||||||
@ -195,7 +201,7 @@ begin
|
|||||||
if FirstExtChar=0 then
|
if FirstExtChar=0 then
|
||||||
Result := etUSASCII
|
Result := etUSASCII
|
||||||
else
|
else
|
||||||
if IsBufferUtf8(@S[FirstExtChar], false) then
|
if IsBufferUtf8(@S[FirstExtChar], Length(S)-FirstExtChar+1, false)=u8sYes then
|
||||||
Result := etUTF8
|
Result := etUTF8
|
||||||
else
|
else
|
||||||
Result := etANSI;
|
Result := etANSI;
|
||||||
|
Loading…
Reference in New Issue
Block a user