* Fix bug ID #37315: add some delphi compatibility functions

git-svn-id: trunk@47393 -
2025-08-16 10:59:10 +02:00 · 2020-11-12 10:12:18 +00:00 · 2020-11-12 10:12:18 +00:00 · a8db09b37e
commit a8db09b37e
parent 4a38d1b34e
1 changed files with 157 additions and 2 deletions
--- a/packages/rtl-objpas/src/inc/widestrutils.pp
+++ b/packages/rtl-objpas/src/inc/widestrutils.pp
@ -1,4 +1,18 @@
-unit WideStrUtils;
+{
+    Delphi/Kylix compatibility unit: String handling routines.
+
+    This file is part of the Free Pascal run time library.
+    Copyright (c) 1999-2005 by the Free Pascal development team
+
+    See the file COPYING.FPC, included in this distribution,
+    for details about the copyright.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+
+ **********************************************************************}
+unit widestrutils;

 {$mode objfpc}
 {$H+}
@ -7,7 +21,7 @@ unit WideStrUtils;
 interface

 uses
-  SysUtils;
+  SysUtils, Classes;

 function WideStringReplace(const S, OldPattern, NewPattern: WideString; Flags: TReplaceFlags): WideString;
 function WideReplaceStr(const AText, AFromText, AToText: WideString): WideString; inline;
@ -17,9 +31,81 @@ function UnicodeStringReplace(const S, OldPattern, NewPattern: UnicodeString; Fl
 function UnicodeReplaceStr(const AText, AFromText, AToText: UnicodeString): UnicodeString; inline;
 function UnicodeReplaceText(const AText, AFromText, AToText: UnicodeString): UnicodeString; inline;

+type
+  TEncodeType = (etUSASCII, etUTF8, etANSI);
+
+const
+  sUTF8BOMString: array[1..3] of char = (#$EF, #$BB, #$BF);
+
+function HasUTF8BOM(S: TStream): boolean; overload;
+function HasUTF8BOM(const S: RawByteString): boolean; overload;
+function HasExtendCharacter(const S: RawByteString): boolean;
+function DetectUTF8Encoding(const S: RawByteString): TEncodeType;
+function IsUTF8String(const S: RawByteString): boolean;
+
+//PartialAllowed must be set to true if the buffer is smaller than the file.
+function IsBufferUtf8(buf:PAnsiChar;PartialAllowed:boolean):boolean;

 implementation

+{
+  The IsBufferUtf8 function code was created by Christian Ghisler (ghisler.com)
+  Christian gave code to open-source at Total Commander public forum
+}
+
+const bytesFromUTF8:array[AnsiChar] of byte = (
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 32
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 64
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  // 96
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  //128
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  //160
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  //192
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  //224
+  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5); //256
+
+
+function IsFirstUTF8Char(thechar:AnsiChar):boolean; inline;
+{The remaining bytes in a multi-byte sequence have 10 as their two most significant bits.}
+begin
+  result:=(byte(thechar) and (128+64))<>128;
+end;
+
+function IsSecondaryUTF8Char(thechar:AnsiChar):boolean; inline;
+{The remaining bytes in a multi-byte sequence have 10 as their two most significant bits.}
+begin
+  result:=(byte(thechar) and (128+64))=128;
+end;
+
+function IsBufferUtf8(buf:PAnsiChar;PartialAllowed:boolean):boolean;
+{Buffer contains only valid UTF-8 characters, no secondary alone,
+no primary without the correct nr of secondary}
+var p:PAnsiChar;
+    utf8bytes:integer;
+    hadutf8bytes:boolean;
+begin
+  p:=buf;
+  hadutf8bytes:=false;
+  result:=false;
+  utf8bytes:=0;
+  while p^<>#0 do
+  begin
+    if utf8bytes>0 then
+    begin  {Expecting secondary AnsiChar}
+      hadutf8bytes:=true;
+      if not IsSecondaryUTF8Char(p^) then exit;  {Fail!}
+      dec(utf8bytes);
+    end
+    else
+    if IsFirstUTF8Char(p^) then
+      utf8bytes:=bytesFromUTF8[p^]
+    else
+    //if IsSecondaryUTF8Char(p^) then //Alexey: redundant check
+      exit;  {Fail!}
+    inc(p);
+  end;
+    result:=hadutf8bytes and (PartialAllowed or (utf8bytes=0));
+end;
+
 function WideReplaceStr(const AText, AFromText, AToText: WideString): WideString; inline;
 begin
  Result := WideStringReplace(AText, AFromText, AToText, [rfReplaceAll]);
@ -52,5 +138,74 @@ begin
  Result:= sysutils.UnicodeStringReplace(S,OldPattern,NewPattern,Flags);
 end;

+function HasUTF8BOM(S: TStream): boolean;
+var
+  OldPos: Int64;
+  Buf: array[1..3] of char;
+begin
+  Result := false;
+  if S.Size<3 then exit;
+  FillChar(Buf, SizeOf(Buf), 0);
+  try
+    OldPos := S.Position;
+    S.Position := 0;
+    if S.Read(Buf, 3)<>3 then exit;
+    Result :=
+      (Buf[1]=sUTF8BOMString[1]) and
+      (Buf[2]=sUTF8BOMString[2]) and
+      (Buf[3]=sUTF8BOMString[3]);
+  finally
+    S.Position := OldPos;
+  end;
+end;
+
+function HasUTF8BOM(const S: RawByteString): boolean;
+begin
+  Result := (Length(S)>=3) and
+    (S[1]=sUTF8BOMString[1]) and
+    (S[2]=sUTF8BOMString[2]) and
+    (S[3]=sUTF8BOMString[3]);
+end;
+
+function HasExtendCharacter(const S: RawByteString): boolean;
+var
+  i: integer;
+begin
+  for i := 1 to Length(S) do
+    if Ord(S[i])>=$80 then
+    begin
+      Result := true;
+      exit;
+    end;
+  Result := false;
+end;
+
+function DetectUTF8Encoding(const S: RawByteString): TEncodeType;
+var
+  FirstExtChar, i: integer;
+begin
+  FirstExtChar := 0;
+  for i := 1 to Length(S) do
+    if Ord(S[i])>=$80 then
+    begin
+      FirstExtChar := i;
+      Break;
+    end;
+
+  if FirstExtChar=0 then
+    Result := etUSASCII
+  else
+  if IsBufferUtf8(@S[FirstExtChar], false) then
+    Result := etUTF8
+  else
+    Result := etANSI;
+end;
+
+function IsUTF8String(const S: RawByteString): boolean;
+begin
+  Result := DetectUTF8Encoding(S) = etUTF8;
+end;
+
+
 end.