mirror of
https://gitlab.com/freepascal.org/lazarus/lazarus.git
synced 2025-05-20 02:02:33 +02:00

- An attempt to unify the defines for the different scenario's in the use of (not) codepage aware ansistrings and the use of the "Utf8 in RTL" feature. It makes for better separation of code and thus better readability and ease of maintainance (and in a later stadium it makes it easier to remove code that deals with non codepage aware ansistrings (fpc < 3.0)). - Also replace (FPC_FULLVERSION >= xxxx) with FPC_HAS_CPSTRING where appropriate. - Replace the custom HasCP define with built in FPC_HAS_CPSTRING define. git-svn-id: trunk@50498 -
534 lines
14 KiB
ObjectPascal
534 lines
14 KiB
ObjectPascal
{
|
|
*****************************************************************************
|
|
paswstring.pas
|
|
|
|
A widestring manager written in Pascal
|
|
and optimized for DefaultSystemCodePage CP_UTF8.
|
|
|
|
*****************************************************************************
|
|
This file is part of the LazUtils package
|
|
|
|
See the file COPYING.modifiedLGPL.txt, included in this distribution,
|
|
for details about the license.
|
|
*****************************************************************************
|
|
}
|
|
unit PasWString;
|
|
|
|
{$mode objfpc}
|
|
{$inline on}
|
|
{$i lazutils_defines.inc}
|
|
|
|
//{$define PASWSTRING_VERBOSE}
|
|
//{.$define PASWSTRING_SUPPORT_NONUTF8_ANSISTRING} disabled by default because
|
|
// non utf-8 ansistring is rare in UNIXes and lconvencoding makes the executable big
|
|
|
|
// sanity checks for defines
|
|
//{$IF FPC_FULLVERSION >= 30000}
|
|
{$IFnDEF NO_CP_RTL}
|
|
{$IFDEF UTF8_RTL}
|
|
{$IFDEF PASWSTRING_SUPPORT_NONUTF8_ANSISTRING}
|
|
{$error UTF8 or not UTF8}
|
|
{$ENDIF}
|
|
{$ENDIF}
|
|
{$DEFINE DisablePasWString}
|
|
{$ENDIF}
|
|
|
|
interface
|
|
|
|
uses
|
|
SysUtils, LazUTF8
|
|
{$ifdef PASWSTRING_SUPPORT_NONUTF8_ANSISTRING}, lconvencoding{$endif}
|
|
;
|
|
|
|
{$IFnDEF DisablePasWString}
|
|
procedure SetPasWidestringManager;
|
|
{$ENDIF}
|
|
|
|
implementation
|
|
|
|
{$IFnDEF DisablePasWString}
|
|
procedure fpc_rangeerror; [external name 'FPC_RANGEERROR'];
|
|
|
|
function IsASCII(const s: string): boolean; inline;
|
|
var
|
|
i: Integer;
|
|
begin
|
|
for i:=1 to length(s) do if ord(s[i])>127 then exit(false);
|
|
Result:=true;
|
|
end;
|
|
|
|
// len comes in widechars, not bytes
|
|
procedure Wide2AnsiMove(source:pwidechar;var dest:ansistring;len:SizeInt);
|
|
begin
|
|
{$ifdef PASWSTRING_VERBOSE}WriteLn('Wide2AnsiMove START');{$endif}
|
|
dest := UTF16ToUTF8(Source,len);
|
|
{$ifdef PASWSTRING_SUPPORT_NONUTF8_ANSISTRING}
|
|
// And correct to the real Ansi encoding
|
|
dest := ConvertEncoding(dest, EncodingUTF8, GetDefaultTextEncoding());
|
|
{$endif}
|
|
end;
|
|
|
|
procedure Ansi2WideMove(source:pchar;var dest:widestring;len:SizeInt);
|
|
{$ifdef PASWSTRING_SUPPORT_NONUTF8_ANSISTRING}
|
|
var
|
|
ansistr: ansistring;
|
|
{$endif}
|
|
begin
|
|
{$ifdef PASWSTRING_VERBOSE}WriteLn('Ansi2WideMove START');{$endif}
|
|
|
|
{$ifdef PASWSTRING_SUPPORT_NONUTF8_ANSISTRING}
|
|
// Copy the originating string taking into account the specified length
|
|
SetLength(ansistr, len);
|
|
System.Move(source^, ansistr[1], len);
|
|
// Convert to UTF-8
|
|
ansistr := ConvertEncoding(ansistr, GetDefaultTextEncoding(), EncodingUTF8);
|
|
// Now convert it, using UTF-8 -> UTF-16
|
|
dest := UTF8ToUTF16(ansistr);
|
|
{$else}
|
|
dest := UTF8ToUTF16(source,len);
|
|
{$endif}
|
|
end;
|
|
|
|
function LowerWideString(const s : WideString) : WideString;
|
|
var
|
|
str: string;
|
|
begin
|
|
{$ifdef PASWSTRING_VERBOSE}WriteLn('LowerWideString START');{$endif}
|
|
str := UTF16ToUTF8(PWideChar(s),length(s));
|
|
str := UTF8LowerCase(str);
|
|
Result := UTF8ToUTF16(str);
|
|
end;
|
|
|
|
function UpperWideString(const s : WideString) : WideString;
|
|
var
|
|
str: string;
|
|
begin
|
|
{$ifdef PASWSTRING_VERBOSE}WriteLn('UpperWideString START');{$endif}
|
|
str := UTF16ToUTF8(PWideChar(s),length(s));
|
|
str := UTF8UpperCase(str);
|
|
Result := UTF8ToUTF16(str);
|
|
end;
|
|
|
|
procedure EnsureAnsiLen(var S: AnsiString; const len: SizeInt); inline;
|
|
var
|
|
l: SizeUInt;
|
|
begin
|
|
{$ifdef PASWSTRING_VERBOSE}WriteLn('EnsureAnsiLen START');{$endif}
|
|
l:=length(s);
|
|
if (len>l) then
|
|
if (l < 128) then
|
|
setlength(s,l+8)
|
|
else
|
|
setlength(s,l+l shr 8);
|
|
end;
|
|
|
|
|
|
procedure ConcatCharToAnsiStr(const c: char; var S: AnsiString; var index: SizeInt);
|
|
begin
|
|
{$ifdef PASWSTRING_VERBOSE}WriteLn('ConcatCharToAnsiStr START');{$endif}
|
|
EnsureAnsiLen(s,index);
|
|
pchar(@s[index])^:=c;
|
|
inc(index);
|
|
end;
|
|
|
|
function LowerAnsiString(const s : AnsiString) : AnsiString;
|
|
var
|
|
Str: string;
|
|
begin
|
|
{$ifdef PASWSTRING_VERBOSE}WriteLn('LowerAnsiString START');{$endif}
|
|
Str := SysToUTF8(s);
|
|
Str := UTF8LowerCase(Str);
|
|
Result := UTF8ToSys(Str);
|
|
end;
|
|
|
|
function UpperAnsiString(const s : AnsiString) : AnsiString;
|
|
var
|
|
Str: string;
|
|
begin
|
|
{$ifdef PASWSTRING_VERBOSE}WriteLn('UpperAnsiString START');{$endif}
|
|
Str := SysToUTF8(s);
|
|
Str := UTF8UpperCase(Str);
|
|
Result := UTF8ToSys(Str);
|
|
end;
|
|
|
|
// Just do a simple byte comparison
|
|
// A more complex analysis would require normalization
|
|
function WideCompareStr(const s1, s2 : WideString) : PtrInt;
|
|
var
|
|
count, count1, count2: integer;
|
|
begin
|
|
{$ifdef PASWSTRING_VERBOSE}WriteLn('WideCompareStr START');{$endif}
|
|
result := 0;
|
|
Count1 := Length(S1);
|
|
Count2 := Length(S2);
|
|
if Count1>Count2 then
|
|
Count:=Count2
|
|
else
|
|
Count:=Count1;
|
|
result := SysUtils.CompareMemRange(Pointer(S1),Pointer(S2), Count*2);
|
|
if result=0 then
|
|
result:=Count1-Count2;
|
|
end;
|
|
|
|
function WideCompareText(const s1, s2 : WideString): PtrInt;
|
|
var
|
|
a, b: String;
|
|
begin
|
|
{$ifdef PASWSTRING_VERBOSE}WriteLn('WideCompareText START');{$endif}
|
|
a := UTF16ToUTF8(PWideChar(s1),length(s1));
|
|
a := UTF8LowerCase(a);
|
|
b := UTF16ToUTF8(PWideChar(s2),length(s2));
|
|
b := UTF8LowerCase(b);
|
|
result := UTF8CompareText(a,b);
|
|
end;
|
|
|
|
function CharLengthPChar(const Str: PChar): PtrInt;
|
|
// return the number of codepoints (including invalid codepoints)
|
|
var
|
|
p: PChar;
|
|
l: Integer;
|
|
begin
|
|
{$ifdef PASWSTRING_VERBOSE}WriteLn('CharLengthPChar START');{$endif}
|
|
p:=Str;
|
|
if p=nil then exit(0);
|
|
while p^<>#0 do begin
|
|
l:=UTF8CharacterLength(p);
|
|
inc(Result);
|
|
inc(p,l);
|
|
end;
|
|
end;
|
|
|
|
function CodePointLengthPChar(const p: PChar; MaxLookAhead: PtrInt): Ptrint;
|
|
{ return value:
|
|
-1 if incomplete or invalid code point
|
|
0 if NULL character,
|
|
> 0 if that's the length in bytes of the code point }
|
|
begin
|
|
{$ifdef PASWSTRING_VERBOSE}WriteLn('CodePointLengthPChar START');{$endif}
|
|
if (p=nil) then exit(0);
|
|
if (MaxLookAhead<0) then exit(-1);
|
|
if ord(p^)<%10000000 then begin
|
|
// regular single byte character
|
|
if p^=#0 then
|
|
exit(0)
|
|
else
|
|
exit(1);
|
|
end;
|
|
if ord(p^)<%11000000 then begin
|
|
// invalid single byte character
|
|
exit(-1);
|
|
end;
|
|
if (MaxLookAhead=0) then exit(-1);
|
|
if ((ord(p^) and %11100000) = %11000000) then begin
|
|
// should be 2 byte character
|
|
if (ord(p[1]) and %11000000) = %10000000 then
|
|
exit(2)
|
|
else
|
|
exit(-1);
|
|
end;
|
|
if (MaxLookAhead=1) then exit(-1);
|
|
if ((ord(p^) and %11110000) = %11100000) then begin
|
|
// should be 3 byte character
|
|
if ((ord(p[1]) and %11000000) = %10000000)
|
|
and ((ord(p[2]) and %11000000) = %10000000) then
|
|
exit(3)
|
|
else
|
|
exit(-1);
|
|
end;
|
|
if (MaxLookAhead=2) then exit(-1);
|
|
if ((ord(p^) and %11111000) = %11110000) then begin
|
|
// should be 4 byte character
|
|
if ((ord(p[1]) and %11000000) = %10000000)
|
|
and ((ord(p[2]) and %11000000) = %10000000)
|
|
and ((ord(p[3]) and %11000000) = %10000000) then
|
|
exit(4)
|
|
else
|
|
exit(-1);
|
|
end;
|
|
exit(-1);
|
|
end;
|
|
|
|
function AnsiCompareStr(const s1, s2: ansistring): PtrInt;
|
|
begin
|
|
{$ifdef PASWSTRING_VERBOSE}WriteLn('AnsiCompareStr START');{$endif}
|
|
Result := SysUtils.CompareStr(s1, s2);
|
|
end;
|
|
|
|
// Similar to AnsiCompareStr, but with PChar
|
|
function StrCompAnsi(s1,s2 : PChar): PtrInt;
|
|
var
|
|
Count1: SizeInt;
|
|
Count2: SizeInt;
|
|
Count: SizeInt;
|
|
begin
|
|
{$ifdef PASWSTRING_VERBOSE}WriteLn('StrCompAnsi START');{$endif}
|
|
result := 0;
|
|
Count1:=StrLen(s1);
|
|
Count2:=StrLen(s2);
|
|
if Count1>Count2 then
|
|
Count:=Count2
|
|
else
|
|
Count:=Count1;
|
|
result := CompareMemRange(s1, s2, Count);
|
|
if result=0 then
|
|
result:=Count1-Count2;
|
|
end;
|
|
|
|
|
|
function AnsiCompareText(const S1, S2: ansistring): PtrInt;
|
|
var
|
|
str1, str2: string;
|
|
begin
|
|
{$ifdef PASWSTRING_VERBOSE}WriteLn('AnsiCompareText START');{$endif}
|
|
str1 := SysToUTF8(S1);
|
|
str2 := SysToUTF8(S2);
|
|
Result := UTF8CompareText(str1, str2);
|
|
end;
|
|
|
|
|
|
function AnsiStrIComp(S1, S2: PChar): PtrInt;
|
|
begin
|
|
{$ifdef PASWSTRING_VERBOSE}WriteLn('AnsiStrIComp START');{$endif}
|
|
Result := AnsiCompareText(StrPas(s1),StrPas(s2));
|
|
end;
|
|
|
|
|
|
function AnsiStrLComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
|
|
var
|
|
a, b: pchar;
|
|
begin
|
|
{$ifdef PASWSTRING_VERBOSE}WriteLn('AnsiStrLComp START');{$endif}
|
|
Result := 0;
|
|
if (maxlen=0) then
|
|
exit(0);
|
|
if (s1[maxlen]<>#0) then
|
|
begin
|
|
getmem(a,maxlen+1);
|
|
move(s1^,a^,maxlen);
|
|
a[maxlen]:=#0;
|
|
end
|
|
else
|
|
a:=s1;
|
|
if (s2[maxlen]<>#0) then
|
|
begin
|
|
getmem(b,maxlen+1);
|
|
move(s2^,b^,maxlen);
|
|
b[maxlen]:=#0;
|
|
end
|
|
else
|
|
b:=s2;
|
|
result:=StrCompAnsi(a,b);
|
|
if (a<>s1) then
|
|
freemem(a);
|
|
if (b<>s2) then
|
|
freemem(b);
|
|
end;
|
|
|
|
|
|
function AnsiStrLIComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
|
|
var
|
|
a, b: ansistring;
|
|
begin
|
|
{$ifdef PASWSTRING_VERBOSE}WriteLn('AnsiStrLIComp START');{$endif}
|
|
if (maxlen=0) then
|
|
exit(0);
|
|
setlength(a,maxlen);
|
|
move(s1^,a[1],maxlen);
|
|
setlength(b,maxlen);
|
|
move(s2^,b[1],maxlen);
|
|
result:=AnsiCompareText(a,b);
|
|
end;
|
|
|
|
|
|
procedure ansi2pchar(const s: ansistring; const orgp: pchar; out p: pchar);
|
|
var
|
|
newlen: sizeint;
|
|
begin
|
|
{$ifdef PASWSTRING_VERBOSE}WriteLn('ansi2pchar START');{$endif}
|
|
newlen:=length(s);
|
|
if newlen>strlen(orgp) then
|
|
fpc_rangeerror;
|
|
p:=orgp;
|
|
if (newlen>0) then
|
|
move(s[1],p[0],newlen);
|
|
p[newlen]:=#0;
|
|
end;
|
|
|
|
|
|
function AnsiStrLower(Str: PChar): PChar;
|
|
var
|
|
temp: ansistring;
|
|
begin
|
|
{$ifdef PASWSTRING_VERBOSE}WriteLn('AnsiStrLower START');{$endif}
|
|
temp:=loweransistring(str);
|
|
ansi2pchar(temp,str,result);
|
|
end;
|
|
|
|
|
|
function AnsiStrUpper(Str: PChar): PChar;
|
|
var
|
|
temp: ansistring;
|
|
begin
|
|
{$ifdef PASWSTRING_VERBOSE}WriteLn('AnsiStrUpper START');{$endif}
|
|
temp:=upperansistring(str);
|
|
ansi2pchar(temp,str,result);
|
|
end;
|
|
|
|
|
|
procedure InitThread;
|
|
begin
|
|
end;
|
|
|
|
|
|
procedure FiniThread;
|
|
begin
|
|
end;
|
|
|
|
{ Unicode }
|
|
|
|
procedure Unicode2AnsiMove(source:pwidechar;var dest:ansistring;len:SizeInt);
|
|
begin
|
|
{$ifdef PASWSTRING_VERBOSE}WriteLn('Unicode2AnsiMove START');{$endif}
|
|
dest := UTF16ToUTF8(source,len);
|
|
{$ifdef PASWSTRING_SUPPORT_NONUTF8_ANSISTRING}
|
|
// And correct to the real Ansi encoding
|
|
dest := ConvertEncoding(dest, EncodingUTF8, GetDefaultTextEncoding());
|
|
{$endif}
|
|
end;
|
|
|
|
procedure Ansi2UnicodeMove(source:pchar;var dest:UnicodeString;len:SizeInt);
|
|
{$IFDEF PASWSTRING_SUPPORT_NONUTF8_ANSISTRING}
|
|
var
|
|
ansistr: ansistring;
|
|
{$ENDIF}
|
|
begin
|
|
{$ifdef PASWSTRING_VERBOSE}WriteLn('Ansi2UnicodeMove START');{$endif}
|
|
{$IFDEF PASWSTRING_SUPPORT_NONUTF8_ANSISTRING}
|
|
if NeedRTLAnsi then begin
|
|
// Copy the originating string taking into account the specified length
|
|
SetLength(ansistr, len);
|
|
System.Move(source^, ansistr[1], len);
|
|
// Convert to UTF-8
|
|
ansistr := ConvertEncoding(ansistr, GetDefaultTextEncoding(), EncodingUTF8);
|
|
// Now convert it, using UTF-8 -> UTF-16
|
|
dest := UTF8ToUTF16(ansistr);
|
|
end else begin
|
|
dest := UTF8ToUTF16(source,len);
|
|
end;
|
|
{$ELSE}
|
|
dest := UTF8ToUTF16(source,len);
|
|
{$ENDIF}
|
|
end;
|
|
|
|
function UpperUnicodeString(const s : UnicodeString) : UnicodeString;
|
|
var
|
|
str: string;
|
|
begin
|
|
{$ifdef PASWSTRING_VERBOSE}WriteLn('UpperUnicodeString START');{$endif}
|
|
str := UTF16ToUTF8(s);
|
|
str := UTF8UpperCase(str);
|
|
Result := UTF8ToUTF16(str);
|
|
end;
|
|
|
|
function LowerUnicodeString(const s : UnicodeString) : UnicodeString;
|
|
var
|
|
str: string;
|
|
begin
|
|
{$ifdef PASWSTRING_VERBOSE}WriteLn('LowerUnicodeString START');{$endif}
|
|
str := UTF16ToUTF8(s);
|
|
str := UTF8LowerCase(str);
|
|
Result := UTF8ToUTF16(str);
|
|
end;
|
|
|
|
// Just do a simple byte comparison
|
|
// A more complex analysis would require normalization
|
|
function PasUnicodeCompareStr(const s1, s2 : unicodestring) : PtrInt;
|
|
var
|
|
count, count1, count2: integer;
|
|
begin
|
|
{$ifdef PASWSTRING_VERBOSE}WriteLn('PasUnicodeCompareStr START');{$endif}
|
|
result := 0;
|
|
Count1 := Length(S1);
|
|
Count2 := Length(S2);
|
|
if Count1>Count2 then
|
|
Count:=Count2
|
|
else
|
|
Count:=Count1;
|
|
result := SysUtils.CompareMemRange(Pointer(S1),Pointer(S2), Count*2);
|
|
if result=0 then
|
|
result:=Count1-Count2;
|
|
end;
|
|
|
|
function PasUnicodeCompareText(const s1, s2 : unicodestring): PtrInt;
|
|
var
|
|
a, b: string;
|
|
begin
|
|
{$ifdef PASWSTRING_VERBOSE}WriteLn('PasUnicodeCompareText START');{$endif}
|
|
a := UTF16ToUTF8(s1);
|
|
a := UTF8LowerCase(a);
|
|
b := UTF16ToUTF8(s2);
|
|
b := UTF8LowerCase(b);
|
|
result := UTF8CompareText(a,b);
|
|
end;
|
|
|
|
Procedure SetPasWideStringManager;
|
|
Var
|
|
PasWideStringManager : TUnicodeStringManager;
|
|
begin
|
|
PasWideStringManager:=widestringmanager;
|
|
PasWideStringManager.Wide2AnsiMoveProc:=@Wide2AnsiMove;
|
|
PasWideStringManager.Ansi2WideMoveProc:=@Ansi2WideMove;
|
|
|
|
// UpperUTF8 : procedure(p:PUTF8String);
|
|
PasWideStringManager.UpperWideStringProc:=@UpperWideString;
|
|
// UpperUCS4 : procedure(p:PUCS4Char);
|
|
// LowerUTF8 : procedure(p:PUTF8String);
|
|
PasWideStringManager.LowerWideStringProc:=@LowerWideString;
|
|
// LowerUCS4 : procedure(p:PUCS4Char);
|
|
|
|
{
|
|
CompUTF8 : function(p1,p2:PUTF8String) : shortint;
|
|
CompUCS2 : function(p1,p2:PUCS2Char) : shortint;
|
|
CompUCS4 : function(p1,p2:PUC42Char) : shortint;
|
|
}
|
|
PasWideStringManager.CompareWideStringProc:=@WideCompareStr;
|
|
PasWideStringManager.CompareTextWideStringProc:=@WideCompareText;
|
|
|
|
{ return value: number of code points in the string. Whenever an invalid
|
|
code point is encountered, all characters part of this invalid code point
|
|
are considered to form one "character" and the next character is
|
|
considered to be the start of a new (possibly also invalid) code point }
|
|
PasWideStringManager.CharLengthPCharProc:=@CharLengthPChar;
|
|
PasWideStringManager.CodePointLengthProc:=@CodePointLengthPChar;
|
|
|
|
{ Ansi }
|
|
PasWideStringManager.UpperAnsiStringProc:=@UpperAnsiString;
|
|
PasWideStringManager.LowerAnsiStringProc:=@LowerAnsiString;
|
|
PasWideStringManager.CompareStrAnsiStringProc:=@AnsiCompareStr;
|
|
PasWideStringManager.CompareTextAnsiStringProc:=@AnsiCompareText;
|
|
PasWideStringManager.StrCompAnsiStringProc:=@StrCompAnsi;
|
|
PasWideStringManager.StrICompAnsiStringProc:=@AnsiStrIComp;
|
|
PasWideStringManager.StrLCompAnsiStringProc:=@AnsiStrLComp;
|
|
PasWideStringManager.StrLICompAnsiStringProc:=@AnsiStrLIComp;
|
|
PasWideStringManager.StrLowerAnsiStringProc:=@AnsiStrLower;
|
|
PasWideStringManager.StrUpperAnsiStringProc:=@AnsiStrUpper;
|
|
PasWideStringManager.ThreadInitProc:=@InitThread;
|
|
PasWideStringManager.ThreadFiniProc:=@FiniThread;
|
|
|
|
{ Unicode }
|
|
PasWideStringManager.Unicode2AnsiMoveProc:=@Unicode2AnsiMove;
|
|
PasWideStringManager.Ansi2UnicodeMoveProc:=@Ansi2UnicodeMove;
|
|
PasWideStringManager.UpperUnicodeStringProc:=@UpperUnicodeString;
|
|
PasWideStringManager.LowerUnicodeStringProc:=@LowerUnicodeString;
|
|
PasWideStringManager.CompareUnicodeStringProc:=@PasUnicodeCompareStr;
|
|
PasWideStringManager.CompareTextUnicodeStringProc:=@PasUnicodeCompareText;
|
|
|
|
SetUnicodeStringManager(PasWideStringManager);
|
|
end;
|
|
|
|
|
|
initialization
|
|
SetPasWideStringManager;
|
|
{$ENDIF}
|
|
end.
|