mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-04-16 23:21:57 +02:00
* Improved UTF8 detection by Alexey
git-svn-id: trunk@47414 -
This commit is contained in:
parent
bf7961a901
commit
8bbd92bf82
@ -43,8 +43,11 @@ function HasExtendCharacter(const S: RawByteString): boolean;
|
||||
function DetectUTF8Encoding(const S: RawByteString): TEncodeType;
|
||||
function IsUTF8String(const S: RawByteString): boolean;
|
||||
|
||||
type
|
||||
TBufferUTF8State = (u8sUnknown, u8sYes, u8sNo);
|
||||
|
||||
//PartialAllowed must be set to true if the buffer is smaller than the file.
|
||||
function IsBufferUtf8(buf:PAnsiChar;PartialAllowed:boolean):boolean;
|
||||
function IsBufferUTF8(buf: PAnsiChar; bufSize: SizeInt; PartialAllowed: boolean): TBufferUTF8State;
|
||||
|
||||
implementation
|
||||
|
||||
@ -76,34 +79,37 @@ begin
|
||||
result:=(byte(thechar) and (128+64))=128;
|
||||
end;
|
||||
|
||||
function IsBufferUtf8(buf:PAnsiChar;PartialAllowed:boolean):boolean;
|
||||
function IsBufferUTF8(buf: PAnsiChar; bufSize: SizeInt; PartialAllowed: boolean): TBufferUTF8State;
|
||||
{Buffer contains only valid UTF-8 characters, no secondary alone,
|
||||
no primary without the correct nr of secondary}
|
||||
var p:PAnsiChar;
|
||||
utf8bytes:integer;
|
||||
hadutf8bytes:boolean;
|
||||
var
|
||||
p: PAnsiChar;
|
||||
i: SizeInt;
|
||||
utf8bytes: integer;
|
||||
hadutf8bytes: boolean;
|
||||
begin
|
||||
p:=buf;
|
||||
hadutf8bytes:=false;
|
||||
result:=false;
|
||||
result:=u8sUnknown;
|
||||
utf8bytes:=0;
|
||||
while p^<>#0 do
|
||||
for i:= 1 to bufSize do
|
||||
begin
|
||||
if utf8bytes>0 then
|
||||
begin {Expecting secondary AnsiChar}
|
||||
hadutf8bytes:=true;
|
||||
if not IsSecondaryUTF8Char(p^) then exit; {Fail!}
|
||||
if not IsSecondaryUTF8Char(p^) then exit(u8sNo); {Fail!}
|
||||
dec(utf8bytes);
|
||||
end
|
||||
else
|
||||
if IsFirstUTF8Char(p^) then
|
||||
utf8bytes:=bytesFromUTF8[p^]
|
||||
else
|
||||
//if IsSecondaryUTF8Char(p^) then //Alexey: redundant check
|
||||
exit; {Fail!}
|
||||
if IsSecondaryUTF8Char(p^) then
|
||||
exit(u8sNo); {Fail!}
|
||||
inc(p);
|
||||
end;
|
||||
result:=hadutf8bytes and (PartialAllowed or (utf8bytes=0));
|
||||
if hadutf8bytes and (PartialAllowed or (utf8bytes=0)) then
|
||||
result:=u8sYes;
|
||||
end;
|
||||
|
||||
function WideReplaceStr(const AText, AFromText, AToText: WideString): WideString; inline;
|
||||
@ -195,7 +201,7 @@ begin
|
||||
if FirstExtChar=0 then
|
||||
Result := etUSASCII
|
||||
else
|
||||
if IsBufferUtf8(@S[FirstExtChar], false) then
|
||||
if IsBufferUtf8(@S[FirstExtChar], Length(S)-FirstExtChar+1, false)=u8sYes then
|
||||
Result := etUTF8
|
||||
else
|
||||
Result := etANSI;
|
||||
|
Loading…
Reference in New Issue
Block a user