diff --git a/packages/rtl-objpas/src/inc/widestrutils.pp b/packages/rtl-objpas/src/inc/widestrutils.pp index b8585fbab2..a80544bf96 100644 --- a/packages/rtl-objpas/src/inc/widestrutils.pp +++ b/packages/rtl-objpas/src/inc/widestrutils.pp @@ -1,4 +1,18 @@ -unit WideStrUtils; +{ + Delphi/Kylix compatibility unit: String handling routines. + + This file is part of the Free Pascal run time library. + Copyright (c) 1999-2005 by the Free Pascal development team + + See the file COPYING.FPC, included in this distribution, + for details about the copyright. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + + **********************************************************************} +unit widestrutils; {$mode objfpc} {$H+} @@ -7,7 +21,7 @@ unit WideStrUtils; interface uses - SysUtils; + SysUtils, Classes; function WideStringReplace(const S, OldPattern, NewPattern: WideString; Flags: TReplaceFlags): WideString; function WideReplaceStr(const AText, AFromText, AToText: WideString): WideString; inline; @@ -17,9 +31,81 @@ function UnicodeStringReplace(const S, OldPattern, NewPattern: UnicodeString; Fl function UnicodeReplaceStr(const AText, AFromText, AToText: UnicodeString): UnicodeString; inline; function UnicodeReplaceText(const AText, AFromText, AToText: UnicodeString): UnicodeString; inline; +type + TEncodeType = (etUSASCII, etUTF8, etANSI); + +const + sUTF8BOMString: array[1..3] of char = (#$EF, #$BB, #$BF); + +function HasUTF8BOM(S: TStream): boolean; overload; +function HasUTF8BOM(const S: RawByteString): boolean; overload; +function HasExtendCharacter(const S: RawByteString): boolean; +function DetectUTF8Encoding(const S: RawByteString): TEncodeType; +function IsUTF8String(const S: RawByteString): boolean; + +//PartialAllowed must be set to true if the buffer is smaller than the file. +function IsBufferUtf8(buf:PAnsiChar;PartialAllowed:boolean):boolean; implementation +{ + The IsBufferUtf8 function code was created by Christian Ghisler (ghisler.com) + Christian gave code to open-source at Total Commander public forum +} + +const bytesFromUTF8:array[AnsiChar] of byte = ( + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 32 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 64 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 96 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //128 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //160 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, //192 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, //224 + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5); //256 + + +function IsFirstUTF8Char(thechar:AnsiChar):boolean; inline; +{The remaining bytes in a multi-byte sequence have 10 as their two most significant bits.} +begin + result:=(byte(thechar) and (128+64))<>128; +end; + +function IsSecondaryUTF8Char(thechar:AnsiChar):boolean; inline; +{The remaining bytes in a multi-byte sequence have 10 as their two most significant bits.} +begin + result:=(byte(thechar) and (128+64))=128; +end; + +function IsBufferUtf8(buf:PAnsiChar;PartialAllowed:boolean):boolean; +{Buffer contains only valid UTF-8 characters, no secondary alone, +no primary without the correct nr of secondary} +var p:PAnsiChar; + utf8bytes:integer; + hadutf8bytes:boolean; +begin + p:=buf; + hadutf8bytes:=false; + result:=false; + utf8bytes:=0; + while p^<>#0 do + begin + if utf8bytes>0 then + begin {Expecting secondary AnsiChar} + hadutf8bytes:=true; + if not IsSecondaryUTF8Char(p^) then exit; {Fail!} + dec(utf8bytes); + end + else + if IsFirstUTF8Char(p^) then + utf8bytes:=bytesFromUTF8[p^] + else + //if IsSecondaryUTF8Char(p^) then //Alexey: redundant check + exit; {Fail!} + inc(p); + end; + result:=hadutf8bytes and (PartialAllowed or (utf8bytes=0)); +end; + function WideReplaceStr(const AText, AFromText, AToText: WideString): WideString; inline; begin Result := WideStringReplace(AText, AFromText, AToText, [rfReplaceAll]); @@ -52,5 +138,74 @@ begin Result:= sysutils.UnicodeStringReplace(S,OldPattern,NewPattern,Flags); end; +function HasUTF8BOM(S: TStream): boolean; +var + OldPos: Int64; + Buf: array[1..3] of char; +begin + Result := false; + if S.Size<3 then exit; + FillChar(Buf, SizeOf(Buf), 0); + try + OldPos := S.Position; + S.Position := 0; + if S.Read(Buf, 3)<>3 then exit; + Result := + (Buf[1]=sUTF8BOMString[1]) and + (Buf[2]=sUTF8BOMString[2]) and + (Buf[3]=sUTF8BOMString[3]); + finally + S.Position := OldPos; + end; +end; + +function HasUTF8BOM(const S: RawByteString): boolean; +begin + Result := (Length(S)>=3) and + (S[1]=sUTF8BOMString[1]) and + (S[2]=sUTF8BOMString[2]) and + (S[3]=sUTF8BOMString[3]); +end; + +function HasExtendCharacter(const S: RawByteString): boolean; +var + i: integer; +begin + for i := 1 to Length(S) do + if Ord(S[i])>=$80 then + begin + Result := true; + exit; + end; + Result := false; +end; + +function DetectUTF8Encoding(const S: RawByteString): TEncodeType; +var + FirstExtChar, i: integer; +begin + FirstExtChar := 0; + for i := 1 to Length(S) do + if Ord(S[i])>=$80 then + begin + FirstExtChar := i; + Break; + end; + + if FirstExtChar=0 then + Result := etUSASCII + else + if IsBufferUtf8(@S[FirstExtChar], false) then + Result := etUTF8 + else + Result := etANSI; +end; + +function IsUTF8String(const S: RawByteString): boolean; +begin + Result := DetectUTF8Encoding(S) = etUTF8; +end; + + end.