Adds paswstring to lazutils, it is feature complete now, although parts might need rethinking specially for fpc 2.7

git-svn-id: trunk@32889 -
This commit is contained in:
sekelsenmat 2011-10-14 19:09:04 +00:00
parent 84443e864a
commit 26e38aa25c
5 changed files with 743 additions and 2 deletions

1
.gitattributes vendored
View File

@ -1716,6 +1716,7 @@ components/lazutils/lazutf8.pas svneol=native#text/plain
components/lazutils/lazutils.lpk svneol=native#text/plain
components/lazutils/lazutils.pas svneol=native#text/plain
components/lazutils/luresstrings.pas svneol=native#text/plain
components/lazutils/paswstring.pas svneol=native#text/plain
components/leakview/heaptrcview.lfm svneol=native#text/plain
components/leakview/heaptrcview.pas svneol=native#text/plain
components/leakview/languages/heaptrcview.cs.po svneol=native#text/plain

View File

@ -87,6 +87,25 @@ function UTF8UpperCase(const AInStr: utf8string; ALocale: utf8string=''): utf8st
function UTF8CompareStr(const S1, S2: utf8string): Integer;
function UTF8CompareText(const S1, S2: utf8string): Integer;
type
TConvertResult = (trNoError, trNullSrc, trNullDest, trDestExhausted,
trInvalidChar, trUnfinishedChar);
TConvertOption = (toInvalidCharError, toInvalidCharToSymbol,
toUnfinishedCharError, toUnfinishedCharToSymbol);
TConvertOptions = set of TConvertOption;
function ConvertUTF8ToUTF16(Dest: PWideChar; DestWideCharCount: SizeUInt;
Src: PChar; SrcCharCount: SizeUInt; Options: TConvertOptions;
out ActualWideCharCount: SizeUInt): TConvertResult;
function ConvertUTF16ToUTF8(Dest: PChar; DestCharCount: SizeUInt;
Src: PWideChar; SrcWideCharCount: SizeUInt; Options: TConvertOptions;
out ActualCharCount: SizeUInt): TConvertResult;
function UTF8ToUTF16(const S: AnsiString): UnicodeString;
function UTF16ToUTF8(const S: UnicodeString): AnsiString;
var
FPUpChars: array[char] of char;
@ -2726,6 +2745,419 @@ begin
Result := UTF8CompareStr(S1Lower, S2Lower);
end;
{------------------------------------------------------------------------------
Name: ConvertUTF8ToUTF16
Params: Dest - Pointer to destination string
DestWideCharCount - Wide char count allocated in destination string
Src - Pointer to source string
SrcCharCount - Char count allocated in source string
Options - Conversion options, if none is set, both
invalid and unfinished source chars are skipped
toInvalidCharError - Stop on invalid source char and report
error
toInvalidCharToSymbol - Replace invalid source chars with '?'
toUnfinishedCharError - Stop on unfinished source char and
report error
toUnfinishedCharToSymbol - Replace unfinished source char with '?'
ActualWideCharCount - Actual wide char count converted from source
string to destination string
Returns:
trNoError - The string was successfully converted without
any error
trNullSrc - Pointer to source string is nil
trNullDest - Pointer to destination string is nil
trDestExhausted - Destination buffer size is not big enough to hold
converted string
trInvalidChar - Invalid source char has occured
trUnfinishedChar - Unfinished source char has occured
Converts the specified UTF-8 encoded string to UTF-16 encoded (system endian)
------------------------------------------------------------------------------}
function ConvertUTF8ToUTF16(Dest: PWideChar; DestWideCharCount: SizeUInt;
Src: PChar; SrcCharCount: SizeUInt; Options: TConvertOptions;
out ActualWideCharCount: SizeUInt): TConvertResult;
var
DestI, SrcI: SizeUInt;
B1, B2, B3, B4: Byte;
W: Word;
C: Cardinal;
function UnfinishedCharError: Boolean;
begin
if toUnfinishedCharToSymbol in Options then
begin
Dest[DestI] := System.WideChar('?');
Inc(DestI);
Result := False;
end
else
if toUnfinishedCharError in Options then
begin
ConvertUTF8ToUTF16 := trUnfinishedChar;
Result := True;
end
else Result := False;
end;
function InvalidCharError(Count: SizeUInt): Boolean; inline;
begin
if not (toInvalidCharError in Options) then
begin
if toInvalidCharToSymbol in Options then
begin
Dest[DestI] := System.WideChar('?');
Inc(DestI);
end;
Dec(SrcI, Count);
// skip trailing UTF-8 char bytes
while (Count > 0) do
begin
if (Byte(Src[SrcI]) and %11000000) <> %10000000 then Break;
Inc(SrcI);
Dec(Count);
end;
Result := False;
end
else
if toInvalidCharError in Options then
begin
ConvertUTF8ToUTF16 := trUnfinishedChar;
Result := True;
end;
end;
begin
ActualWideCharCount := 0;
if not Assigned(Src) then
begin
Result := trNullSrc;
Exit;
end;
if not Assigned(Dest) then
begin
Result := trNullDest;
Exit;
end;
SrcI := 0;
DestI := 0;
while (DestI < DestWideCharCount) and (SrcI < SrcCharCount) do
begin
B1 := Byte(Src[SrcI]);
Inc(SrcI);
if B1 < 128 then // single byte UTF-8 char
begin
Dest[DestI] := System.WideChar(B1);
Inc(DestI);
end
else
begin
if SrcI >= SrcCharCount then
if UnfinishedCharError then Exit(trInvalidChar)
else Break;
B2 := Byte(Src[SrcI]);
Inc(SrcI);
if (B1 and %11100000) = %11000000 then // double byte UTF-8 char
begin
if (B2 and %11000000) = %10000000 then
begin
Dest[DestI] := System.WideChar(((B1 and %00011111) shl 6) or (B2 and %00111111));
Inc(DestI);
end
else // invalid character, assume single byte UTF-8 char
if InvalidCharError(1) then Exit(trInvalidChar);
end
else
begin
if SrcI >= SrcCharCount then
if UnfinishedCharError then Exit(trInvalidChar)
else Break;
B3 := Byte(Src[SrcI]);
Inc(SrcI);
if (B1 and %11110000) = %11100000 then // triple byte UTF-8 char
begin
if ((B2 and %11000000) = %10000000) and ((B3 and %11000000) = %10000000) then
begin
W := ((B1 and %00011111) shl 12) or ((B2 and %00111111) shl 6) or (B3 and %00111111);
if (W < $D800) or (W > $DFFF) then // to single wide char UTF-16 char
begin
Dest[DestI] := System.WideChar(W);
Inc(DestI);
end
else // invalid UTF-16 character, assume double byte UTF-8 char
if InvalidCharError(2) then Exit(trInvalidChar);
end
else // invalid character, assume double byte UTF-8 char
if InvalidCharError(2) then Exit(trInvalidChar);
end
else
begin
if SrcI >= SrcCharCount then
if UnfinishedCharError then Exit(trInvalidChar)
else Break;
B4 := Byte(Src[SrcI]);
Inc(SrcI);
if ((B1 and %11111000) = %11110000) and ((B2 and %11000000) = %10000000)
and ((B3 and %11000000) = %10000000) and ((B4 and %11000000) = %10000000) then
begin // 4 byte UTF-8 char
C := ((B1 and %00011111) shl 18) or ((B2 and %00111111) shl 12)
or ((B3 and %00111111) shl 6) or (B4 and %00111111);
// to double wide char UTF-16 char
Dest[DestI] := System.WideChar($D800 or ((C - $10000) shr 10));
Inc(DestI);
if DestI >= DestWideCharCount then Break;
Dest[DestI] := System.WideChar($DC00 or ((C - $10000) and %0000001111111111));
Inc(DestI);
end
else // invalid character, assume triple byte UTF-8 char
if InvalidCharError(3) then Exit(trInvalidChar);
end;
end;
end;
end;
if DestI >= DestWideCharCount then
begin
DestI := DestWideCharCount - 1;
Result := trDestExhausted;
end
else
Result := trNoError;
Dest[DestI] := #0;
ActualWideCharCount := DestI + 1;
end;
{------------------------------------------------------------------------------
Name: ConvertUTF16ToUTF8
Params: Dest - Pointer to destination string
DestCharCount - Char count allocated in destination string
Src - Pointer to source string
SrcWideCharCount - Wide char count allocated in source string
Options - Conversion options, if none is set, both
invalid and unfinished source chars are skipped.
See ConvertUTF8ToUTF16 for details.
ActualCharCount - Actual char count converted from source
string to destination string
Returns: See ConvertUTF8ToUTF16
Converts the specified UTF-16 encoded string (system endian) to UTF-8 encoded
------------------------------------------------------------------------------}
function ConvertUTF16ToUTF8(Dest: PChar; DestCharCount: SizeUInt;
Src: PWideChar; SrcWideCharCount: SizeUInt; Options: TConvertOptions;
out ActualCharCount: SizeUInt): TConvertResult;
var
DestI, SrcI: SizeUInt;
W1, W2: Word;
C: Cardinal;
function UnfinishedCharError: Boolean;
begin
if toUnfinishedCharToSymbol in Options then
begin
Dest[DestI] := Char('?');
Inc(DestI);
Result := False;
end
else
if toUnfinishedCharError in Options then
begin
ConvertUTF16ToUTF8 := trUnfinishedChar;
Result := True;
end
else Result := False;
end;
function InvalidCharError(Count: SizeUInt): Boolean; inline;
begin
if not (toInvalidCharError in Options) then
begin
if toInvalidCharToSymbol in Options then
begin
Dest[DestI] := Char('?');
Inc(DestI);
end;
Dec(SrcI, Count);
// skip trailing UTF-16 wide char
if (Word(Src[SrcI]) and $FC00) = $DC00 then Inc(SrcI);
Result := False;
end
else
if toInvalidCharError in Options then
begin
ConvertUTF16ToUTF8 := trUnfinishedChar;
Result := True;
end;
end;
begin
ActualCharCount := 0;
if not Assigned(Src) then
begin
Result := trNullSrc;
Exit;
end;
if not Assigned(Dest) then
begin
Result := trNullDest;
Exit;
end;
SrcI := 0;
DestI := 0;
while (DestI < DestCharCount) and (SrcI < SrcWideCharCount) do
begin
W1 := Word(Src[SrcI]);
Inc(SrcI);
if (W1 < $D800) or (W1 > $DFFF) then // single wide char UTF-16 char
begin
if W1 < $0080 then // to single byte UTF-8 char
begin
Dest[DestI] := Char(W1);
Inc(DestI);
end
else
if W1 < $0800 then // to double byte UTF-8 char
begin
Dest[DestI] := Char(%11000000 or ((W1 and %11111000000) shr 6));
Inc(DestI);
if DestI >= DestCharCount then Break;
Dest[DestI] := Char(%10000000 or (W1 and %111111));
Inc(DestI);
end
else
begin // to triple byte UTF-8 char
Dest[DestI] := Char(%11100000 or ((W1 and %1111000000000000) shr 12));
Inc(DestI);
if DestI >= DestCharCount then Break;
Dest[DestI] := Char(%10000000 or ((W1 and %111111000000) shr 6));
Inc(DestI);
if DestI >= DestCharCount then Break;
Dest[DestI] := Char(%10000000 or (W1 and %111111));
Inc(DestI);
end;
end
else
begin
if SrcI >= SrcWideCharCount then
if UnfinishedCharError then Exit(trInvalidChar)
else Break;
W2 := Word(Src[SrcI]);
Inc(SrcI);
if (W1 and $F800) = $D800 then // double wide char UTF-16 char
begin
if (W2 and $FC00) = $DC00 then
begin
C := (W1 - $D800) shl 10 + (W2 - $DC00) + $10000;
// to 4 byte UTF-8 char
Dest[DestI] := Char(%11110000 or (C shr 18));
Inc(DestI);
if DestI >= DestCharCount then Break;
Dest[DestI] := Char(%10000000 or ((C and $3F000) shr 12));
Inc(DestI);
if DestI >= DestCharCount then Break;
Dest[DestI] := Char(%10000000 or ((C and %111111000000) shr 6));
Inc(DestI);
if DestI >= DestCharCount then Break;
Dest[DestI] := Char(%10000000 or (C and %111111));
Inc(DestI);
end
else // invalid character, assume single wide char UTF-16 char
if InvalidCharError(1) then Exit(trInvalidChar);
end
else // invalid character, assume single wide char UTF-16 char
if InvalidCharError(1) then Exit(trInvalidChar);
end;
end;
if DestI >= DestCharCount then
begin
DestI := DestCharCount - 1;
Result := trDestExhausted;
end
else
Result := trNoError;
Dest[DestI] := #0;
ActualCharCount := DestI + 1;
end;
{------------------------------------------------------------------------------
Name: UTF8ToUTF16
Params: S - Source UTF-8 string
Returns: UTF-16 encoded string
Converts the specified UTF-8 encoded string to UTF-16 encoded (system endian)
Avoid copying the result string since on windows a widestring requires a full
copy
------------------------------------------------------------------------------}
function UTF8ToUTF16(const S: AnsiString): UnicodeString;
var
L: SizeUInt;
begin
if S = ''
then begin
Result := '';
Exit;
end;
SetLength(Result, Length(S));
// wide chars of UTF-16 <= bytes of UTF-8 string
if ConvertUTF8ToUTF16(PWideChar(Result), Length(Result) + 1, PChar(S), Length(S),
[toInvalidCharToSymbol], L) = trNoError
then SetLength(Result, L - 1)
else Result := '';
end;
{------------------------------------------------------------------------------
Name: UTF16ToUTF8
Params: S - Source UTF-16 string (system endian)
Returns: UTF-8 encoded string
Converts the specified UTF-16 encoded string (system endian) to UTF-8 encoded
------------------------------------------------------------------------------}
function UTF16ToUTF8(const S: UnicodeString): AnsiString;
var
L: SizeUInt;
R: AnsiString;
begin
Result := '';
if S = '' then Exit;
SetLength(R, Length(S) * 3);
// bytes of UTF-8 <= 3 * wide chars of UTF-16 string
// e.g. %11100000 10100000 10000000 (UTF-8) is $0800 (UTF-16)
if ConvertUTF16ToUTF8(PChar(R), Length(R) + 1, PWideChar(S), Length(S),
[toInvalidCharToSymbol], L) = trNoError then
begin
SetLength(R, L - 1);
Result := R;
end;
end;
procedure InternalInit;
var
c: Char;

View File

@ -25,7 +25,7 @@
<Description Value="Useful units for Lazarus packages."/>
<License Value="Modified LGPL-2"/>
<Version Major="1"/>
<Files Count="16">
<Files Count="17">
<Item1>
<Filename Value="laz2_dom.pas"/>
<UnitName Value="laz2_DOM"/>
@ -90,6 +90,11 @@
<Filename Value="lazdbglog.pas"/>
<UnitName Value="LazDbgLog"/>
</Item16>
<Item17>
<Filename Value="paswstring.pas"/>
<AddToUsesPkgSection Value="False"/>
<UnitName Value="paswstring"/>
</Item17>
</Files>
<LazDoc Paths="docs"/>
<i18n>

View File

@ -9,7 +9,8 @@ interface
uses
laz2_DOM, Laz2_XMLCfg, laz2_XMLRead, laz2_xmlutils, laz2_XMLWrite, Laz_DOM,
Laz_XMLCfg, Laz_XMLRead, Laz_XMLStreaming, Laz_XMLWrite, LazFileUtils,
LazFileCache, LUResStrings, LazUTF8, LazDbgLog, LazarusPackageIntf;
LazFileCache, LUResStrings, LazUTF8, LazDbgLog, paswstring,
LazarusPackageIntf;
implementation

View File

@ -0,0 +1,302 @@
{
*****************************************************************************
* *
* This file is part of the LazUtils package *
* *
* See the file COPYING.modifiedLGPL.txt, included in this distribution, *
* for details about the copyright. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. *
* *
*****************************************************************************
}
unit paswstring;
{$mode objfpc}
{$inline on}
interface
uses SysUtils, lazutf8;
procedure SetPasWidestringManager;
implementation
procedure fpc_rangeerror; [external name 'FPC_RANGEERROR'];
procedure Wide2AnsiMove(source:pwidechar;var dest:ansistring;len:SizeInt);
var
widestr: widestring;
begin
// Copy the originating string taking into account the specified length
SetLength(widestr, len+1);
System.Move(source^, widestr, len);
widestr[len+1] := #0;
// Now convert it, using UTF-8 -> UTF-16
dest := UTF16ToUTF8(widestr);
end;
procedure Ansi2WideMove(source:pchar;var dest:widestring;len:SizeInt);
var
ansistr: ansistring;
begin
// Copy the originating string taking into account the specified length
SetLength(ansistr, len+1);
System.Move(source^, ansistr, len);
ansistr[len+1] := #0;
// Now convert it, using UTF-16 -> UTF-8
dest := UTF8ToUTF16(ansistr);
end;
function LowerWideString(const s : WideString) : WideString;
var
str: utf8string;
begin
str := UTF16ToUTF8(s);
str := UTF8LowerCase(str);
Result := UTF8ToUTF16(str);
end;
function UpperWideString(const s : WideString) : WideString;
var
str: utf8string;
begin
str := UTF16ToUTF8(s);
str := UTF8UpperCase(str);
Result := UTF8ToUTF16(str);
end;
procedure EnsureAnsiLen(var S: AnsiString; const len: SizeInt); inline;
begin
if (len>length(s)) then
if (length(s) < 10*256) then
setlength(s,length(s)+10)
else
setlength(s,length(s)+length(s) shr 8);
end;
procedure ConcatCharToAnsiStr(const c: char; var S: AnsiString; var index: SizeInt);
begin
EnsureAnsiLen(s,index);
pchar(@s[index])^:=c;
inc(index);
end;
function LowerAnsiString(const s : AnsiString) : AnsiString;
var
Str: utf8string;
begin
Str := SysToUTF8(s);
Str := UTF8LowerCase(Str);
Result := UTF8ToSys(Str);
end;
function UpperAnsiString(const s : AnsiString) : AnsiString;
var
Str: utf8string;
begin
Str := SysToUTF8(s);
Str := UTF8UpperCase(Str);
Result := UTF8ToSys(Str);
end;
// Just do a simple byte comparison
// A more complex analysis would require normalization
function WideCompareStr(const s1, s2 : WideString) : PtrInt;
var
count, count1, count2: integer;
begin
result := 0;
Count1 := Length(S1);
Count2 := Length(S2);
if Count1>Count2 then
Count:=Count2
else
Count:=Count1;
result := System.CompareMemRange(Pointer(S1),Pointer(S2), Count*2);
if result=0 then
result:=Count1-Count2;
end;
function WideCompareText(const s1, s2 : WideString): PtrInt;
var
a, b: AnsiString;
begin
a:=LowerWidestring(s1);
b:=LowerWidestring(s2);
result := WideCompareStr(a,b);
end;
function CharLengthPChar(const Str: PChar): PtrInt;
begin
Result := UTF8CharacterLength(Str);
end;
function AnsiCompareStr(const s1, s2: ansistring): PtrInt;
begin
Result := System.CompareStr(s1, s2);
end;
// Similar to AnsiCompareStr, but with PChar
function StrCompAnsi(s1,s2 : PChar): PtrInt;
var
ansi1, ansi2: ansistring;
begin
ansi1 := StrPas(S1);
ansi2 := StrPas(S2);
Result := System.CompareStr(ansi1, ansi2);
end;
function AnsiCompareText(const S1, S2: ansistring): PtrInt;
var
str1, str2: utf8string;
begin
str1 := SysToUTF8(S1);
str2 := SysToUTF8(S2);
Result := UTF8CompareText(str1, str2);
end;
function AnsiStrIComp(S1, S2: PChar): PtrInt;
begin
Result := AnsiCompareText(StrPas(s1),StrPas(s2));
end;
function AnsiStrLComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
var
a, b: pchar;
begin
Result := 0;
if (maxlen=0) then
exit(0);
if (s1[maxlen]<>#0) then
begin
getmem(a,maxlen+1);
move(s1^,a^,maxlen);
a[maxlen]:=#0;
end
else
a:=s1;
if (s2[maxlen]<>#0) then
begin
getmem(b,maxlen+1);
move(s2^,b^,maxlen);
b[maxlen]:=#0;
end
else
b:=s2;
result:=StrCompAnsi(a,b);
if (a<>s1) then
freemem(a);
if (b<>s2) then
freemem(b);
end;
function AnsiStrLIComp(S1, S2: PChar; MaxLen: PtrUInt): PtrInt;
var
a, b: ansistring;
begin
if (maxlen=0) then
exit(0);
setlength(a,maxlen);
move(s1^,a[1],maxlen);
setlength(b,maxlen);
move(s2^,b[1],maxlen);
result:=AnsiCompareText(a,b);
end;
procedure ansi2pchar(const s: ansistring; const orgp: pchar; out p: pchar);
var
newlen: sizeint;
begin
newlen:=length(s);
if newlen>strlen(orgp) then
fpc_rangeerror;
p:=orgp;
if (newlen>0) then
move(s[1],p[0],newlen);
p[newlen]:=#0;
end;
function AnsiStrLower(Str: PChar): PChar;
var
temp: ansistring;
begin
temp:=loweransistring(str);
ansi2pchar(temp,str,result);
end;
function AnsiStrUpper(Str: PChar): PChar;
var
temp: ansistring;
begin
temp:=upperansistring(str);
ansi2pchar(temp,str,result);
end;
procedure InitThread;
begin
end;
procedure FiniThread;
begin
end;
Procedure SetPasWideStringManager;
Var
PasWideStringManager : TUnicodeStringManager;
begin
PasWideStringManager:=widestringmanager;
PasWideStringManager.Wide2AnsiMoveProc:=@Wide2AnsiMove;
PasWideStringManager.Ansi2WideMoveProc:=@Ansi2WideMove;
PasWideStringManager.UpperWideStringProc:=@UpperWideString;
PasWideStringManager.LowerWideStringProc:=@LowerWideString;
PasWideStringManager.CompareWideStringProc:=@WideCompareStr;
PasWideStringManager.CompareTextWideStringProc:=@WideCompareText;
PasWideStringManager.CharLengthPCharProc:=@CharLengthPChar;
PasWideStringManager.UpperAnsiStringProc:=@UpperAnsiString;
PasWideStringManager.LowerAnsiStringProc:=@LowerAnsiString;
PasWideStringManager.CompareStrAnsiStringProc:=@AnsiCompareStr;
PasWideStringManager.CompareTextAnsiStringProc:=@AnsiCompareText;
PasWideStringManager.StrCompAnsiStringProc:=@StrCompAnsi;
PasWideStringManager.StrICompAnsiStringProc:=@AnsiStrIComp;
PasWideStringManager.StrLCompAnsiStringProc:=@AnsiStrLComp;
PasWideStringManager.StrLICompAnsiStringProc:=@AnsiStrLIComp;
PasWideStringManager.StrLowerAnsiStringProc:=@AnsiStrLower;
PasWideStringManager.StrUpperAnsiStringProc:=@AnsiStrUpper;
PasWideStringManager.ThreadInitProc:=@InitThread;
PasWideStringManager.ThreadFiniProc:=@FiniThread;
{ Unicode }
PasWideStringManager.Unicode2AnsiMoveProc:=@Wide2AnsiMove;
PasWideStringManager.Ansi2UnicodeMoveProc:=@Ansi2WideMove;
PasWideStringManager.UpperUnicodeStringProc:=@UpperWideString;
PasWideStringManager.LowerUnicodeStringProc:=@LowerWideString;
SetUnicodeStringManager(PasWideStringManager);
end;
initialization
SetPasWideStringManager;
end.