mirror of
https://gitlab.com/freepascal.org/lazarus/lazarus.git
synced 2025-09-01 14:00:18 +02:00
Redirects many utf8 routines in lclproc to use lazutf8, to avoid code duplication
git-svn-id: trunk@32987 -
This commit is contained in:
parent
b3f587b60f
commit
190bd0e68d
853
lcl/lclproc.pas
853
lcl/lclproc.pas
@ -343,6 +343,8 @@ function ValidUTF8String(const s: String): String;
|
||||
|
||||
procedure AssignUTF8ListToAnsi(UTF8List, AnsiList: TStrings);
|
||||
|
||||
// Felipe: Don't substitute with calls to lazutf16 because lazutf16 includes
|
||||
// some initialization code and tables, which are not necessary for the LCL
|
||||
function UTF16CharacterLength(p: PWideChar): integer;
|
||||
function UTF16Length(const s: widestring): PtrInt;
|
||||
function UTF16Length(p: PWideChar; WordCount: PtrInt): PtrInt;
|
||||
@ -355,12 +357,10 @@ function UTF8CompareStr(const S1, S2: String): Integer;
|
||||
function UTF8CompareText(const S1, S2: String): Integer;
|
||||
|
||||
type
|
||||
TConvertResult = (trNoError, trNullSrc, trNullDest, trDestExhausted,
|
||||
trInvalidChar, trUnfinishedChar);
|
||||
TConvertResult = LazUTF8.TConvertResult;
|
||||
|
||||
TConvertOption = (toInvalidCharError, toInvalidCharToSymbol,
|
||||
toUnfinishedCharError, toUnfinishedCharToSymbol);
|
||||
TConvertOptions = set of TConvertOption;
|
||||
TConvertOption = LazUTF8.TConvertOption;
|
||||
TConvertOptions = LazUTF8.TConvertOptions;
|
||||
|
||||
function ConvertUTF8ToUTF16(Dest: PWideChar; DestWideCharCount: SizeUInt;
|
||||
Src: PChar; SrcCharCount: SizeUInt; Options: TConvertOptions;
|
||||
@ -2649,107 +2649,17 @@ end;
|
||||
|
||||
function UTF8CharacterToUnicode(p: PChar; out CharLen: integer): Cardinal;
|
||||
begin
|
||||
if p<>nil then begin
|
||||
if ord(p^)<%11000000 then begin
|
||||
// regular single byte character (#0 is a normal char, this is pascal ;)
|
||||
Result:=ord(p^);
|
||||
CharLen:=1;
|
||||
end
|
||||
else if ((ord(p^) and %11100000) = %11000000) then begin
|
||||
// could be double byte character
|
||||
if (ord(p[1]) and %11000000) = %10000000 then begin
|
||||
Result:=((ord(p^) and %00011111) shl 6)
|
||||
or (ord(p[1]) and %00111111);
|
||||
CharLen:=2;
|
||||
end else begin
|
||||
Result:=ord(p^);
|
||||
CharLen:=1;
|
||||
end;
|
||||
end
|
||||
else if ((ord(p^) and %11110000) = %11100000) then begin
|
||||
// could be triple byte character
|
||||
if ((ord(p[1]) and %11000000) = %10000000)
|
||||
and ((ord(p[2]) and %11000000) = %10000000) then begin
|
||||
Result:=((ord(p^) and %00011111) shl 12)
|
||||
or ((ord(p[1]) and %00111111) shl 6)
|
||||
or (ord(p[2]) and %00111111);
|
||||
CharLen:=3;
|
||||
end else begin
|
||||
Result:=ord(p^);
|
||||
CharLen:=1;
|
||||
end;
|
||||
end
|
||||
else if ((ord(p^) and %11111000) = %11110000) then begin
|
||||
// could be 4 byte character
|
||||
if ((ord(p[1]) and %11000000) = %10000000)
|
||||
and ((ord(p[2]) and %11000000) = %10000000)
|
||||
and ((ord(p[3]) and %11000000) = %10000000) then begin
|
||||
Result:=((ord(p^) and %00001111) shl 18)
|
||||
or ((ord(p[1]) and %00111111) shl 12)
|
||||
or ((ord(p[2]) and %00111111) shl 6)
|
||||
or (ord(p[3]) and %00111111);
|
||||
CharLen:=4;
|
||||
end else begin
|
||||
Result:=ord(p^);
|
||||
CharLen:=1;
|
||||
end;
|
||||
end
|
||||
else begin
|
||||
// invalid character
|
||||
Result:=ord(p^);
|
||||
CharLen:=1;
|
||||
end;
|
||||
end else begin
|
||||
Result:=0;
|
||||
CharLen:=0;
|
||||
end;
|
||||
Result := LazUTF8.UTF8CharacterToUnicode(p, CharLen);
|
||||
end;
|
||||
|
||||
function UnicodeToUTF8(u: cardinal; Buf: PChar): integer;
|
||||
|
||||
procedure RaiseInvalidUnicode;
|
||||
begin
|
||||
raise Exception.Create('UnicodeToUTF8: invalid unicode: '+IntToStr(u));
|
||||
end;
|
||||
|
||||
begin
|
||||
Result:=UnicodeToUTF8SkipErrors(u,Buf);
|
||||
if Result=0 then
|
||||
RaiseInvalidUnicode;
|
||||
Result := LazUTF8.UnicodeToUTF8(u, Buf);
|
||||
end;
|
||||
|
||||
function UnicodeToUTF8SkipErrors(u: cardinal; Buf: PChar): integer;
|
||||
begin
|
||||
case u of
|
||||
0..$7f:
|
||||
begin
|
||||
Result:=1;
|
||||
Buf[0]:=char(byte(u));
|
||||
end;
|
||||
$80..$7ff:
|
||||
begin
|
||||
Result:=2;
|
||||
Buf[0]:=char(byte($c0 or (u shr 6)));
|
||||
Buf[1]:=char(byte($80 or (u and $3f)));
|
||||
end;
|
||||
$800..$ffff:
|
||||
begin
|
||||
Result:=3;
|
||||
Buf[0]:=char(byte($e0 or (u shr 12)));
|
||||
Buf[1]:=char(byte((u shr 6) and $3f) or $80);
|
||||
Buf[2]:=char(byte(u and $3f) or $80);
|
||||
end;
|
||||
$10000..$10ffff:
|
||||
begin
|
||||
Result:=4;
|
||||
Buf[0]:=char(byte($f0 or (u shr 18)));
|
||||
Buf[1]:=char(byte((u shr 12) and $3f) or $80);
|
||||
Buf[2]:=char(byte((u shr 6) and $3f) or $80);
|
||||
Buf[3]:=char(byte(u and $3f) or $80);
|
||||
end;
|
||||
else
|
||||
Result:=0;
|
||||
end;
|
||||
Result := LazUTF8.UnicodeToUTF8SkipErrors(u, Buf);
|
||||
end;
|
||||
|
||||
function UnicodeToUTF8(u: cardinal): shortstring;
|
||||
@ -2758,36 +2668,14 @@ begin
|
||||
end;
|
||||
|
||||
function UTF8ToDoubleByteString(const s: string): string;
|
||||
var
|
||||
Len: Integer;
|
||||
begin
|
||||
Len:=UTF8Length(s);
|
||||
SetLength(Result,Len*2);
|
||||
if Len=0 then exit;
|
||||
UTF8ToDoubleByte(PChar(s),length(s),PByte(Result));
|
||||
Result := LazUTF8.UTF8ToDoubleByteString(s);
|
||||
end;
|
||||
|
||||
{ returns number of double bytes }
|
||||
function UTF8ToDoubleByte(UTF8Str: PChar; Len: PtrInt; DBStr: PByte): PtrInt;
|
||||
var
|
||||
SrcPos: PChar;
|
||||
CharLen: LongInt;
|
||||
DestPos: PByte;
|
||||
u: Cardinal;
|
||||
begin
|
||||
SrcPos:=UTF8Str;
|
||||
DestPos:=DBStr;
|
||||
Result:=0;
|
||||
while Len>0 do begin
|
||||
u:=UTF8CharacterToUnicode(SrcPos,CharLen);
|
||||
DestPos^:=byte((u shr 8) and $ff);
|
||||
inc(DestPos);
|
||||
DestPos^:=byte(u and $ff);
|
||||
inc(DestPos);
|
||||
inc(SrcPos,CharLen);
|
||||
dec(Len,CharLen);
|
||||
inc(Result);
|
||||
end;
|
||||
Result := LazUTF8.UTF8ToDoubleByte(UTF8Str, Len, DBStr);
|
||||
end;
|
||||
|
||||
{ Find the start of the UTF8 character which contains BytePos,
|
||||
@ -2795,39 +2683,7 @@ end;
|
||||
function UTF8FindNearestCharStart(UTF8Str: PChar; Len: integer;
|
||||
BytePos: integer): integer;
|
||||
begin
|
||||
Result:=0;
|
||||
if (UTF8Str<>nil) and (Len>0) and (BytePos>=0) then begin
|
||||
Result:=BytePos;
|
||||
if Result>Len then Result:=Len-1;
|
||||
if (Result>0) and (ord(UTF8Str[Result]) and %11000000=%10000000) then begin
|
||||
dec(Result);
|
||||
if (Result>0) and (ord(UTF8Str[Result]) and %11000000=%10000000) then begin
|
||||
dec(Result);
|
||||
if (Result>0) and (ord(UTF8Str[Result]) and %11000000=%10000000) then begin
|
||||
dec(Result);
|
||||
// should be four byte character
|
||||
if (ord(UTF8Str[Result]) and %11111000<>%11110000) then begin
|
||||
// broken UTF8 character
|
||||
inc(Result,3);
|
||||
end else begin
|
||||
// is four byte character
|
||||
end;
|
||||
end else if (ord(UTF8Str[Result]) and %11110000<>%11100000) then begin
|
||||
// broken UTF8 character, should be three byte
|
||||
inc(Result,2);
|
||||
end else
|
||||
begin
|
||||
// is three byte character
|
||||
end;
|
||||
end else if (ord(UTF8Str[Result]) and %11100000<>%11000000) then begin
|
||||
// broken UTF8 character, should be two byte
|
||||
inc(Result);
|
||||
end else
|
||||
begin
|
||||
// is two byte character
|
||||
end;
|
||||
end;
|
||||
end;
|
||||
Result := LazUTF8.UTF8FindNearestCharStart(UTF8Str, Len, BytePos);
|
||||
end;
|
||||
|
||||
{ Len is the length in bytes of UTF8Str
|
||||
@ -2836,234 +2692,49 @@ end;
|
||||
This function is similar to UTF8FindNearestCharStart
|
||||
}
|
||||
function UTF8CharStart(UTF8Str: PChar; Len, CharIndex: PtrInt): PChar;
|
||||
var
|
||||
CharLen: LongInt;
|
||||
begin
|
||||
Result:=UTF8Str;
|
||||
if Result<>nil then begin
|
||||
while (CharIndex>0) and (Len>0) do begin
|
||||
CharLen:=UTF8CharacterLength(Result);
|
||||
dec(Len,CharLen);
|
||||
dec(CharIndex);
|
||||
inc(Result,CharLen);
|
||||
end;
|
||||
if (CharIndex>0) or (Len<0) then
|
||||
Result:=nil;
|
||||
end;
|
||||
Result := LazUTF8.UTF8CharStart(UTF8Str, Len, CharIndex);
|
||||
end;
|
||||
|
||||
function UTF8CharToByteIndex(UTF8Str: PChar; Len, CharIndex: PtrInt): PtrInt;
|
||||
var
|
||||
p: PChar;
|
||||
begin
|
||||
p := UTF8CharStart(UTF8Str, Len, CharIndex);
|
||||
if p = nil
|
||||
then Result := -1
|
||||
else Result := p - UTF8Str;
|
||||
Result := LazUTF8.UTF8CharToByteIndex(UTF8Str, Len, CharIndex);
|
||||
end;
|
||||
|
||||
{ fix any broken UTF8 sequences with spaces }
|
||||
procedure UTF8FixBroken(P: PChar);
|
||||
begin
|
||||
if p=nil then exit;
|
||||
while p^<>#0 do begin
|
||||
if ord(p^)<%10000000 then begin
|
||||
// regular single byte character
|
||||
inc(p);
|
||||
end
|
||||
else if ord(p^)<%11000000 then begin
|
||||
// invalid
|
||||
p^:=' ';
|
||||
inc(p);
|
||||
end
|
||||
else if ((ord(p^) and %11100000) = %11000000) then begin
|
||||
// should be 2 byte character
|
||||
if (ord(p[1]) and %11000000) = %10000000 then
|
||||
inc(p,2)
|
||||
else if p[1]<>#0 then
|
||||
p^:=' ';
|
||||
end
|
||||
else if ((ord(p^) and %11110000) = %11100000) then begin
|
||||
// should be 3 byte character
|
||||
if ((ord(p[1]) and %11000000) = %10000000)
|
||||
and ((ord(p[2]) and %11000000) = %10000000) then
|
||||
inc(p,3)
|
||||
else
|
||||
p^:=' ';
|
||||
end
|
||||
else if ((ord(p^) and %11111000) = %11110000) then begin
|
||||
// should be 4 byte character
|
||||
if ((ord(p[1]) and %11000000) = %10000000)
|
||||
and ((ord(p[2]) and %11000000) = %10000000)
|
||||
and ((ord(p[3]) and %11000000) = %10000000) then
|
||||
inc(p,4)
|
||||
else
|
||||
p^:=' ';
|
||||
end
|
||||
else begin
|
||||
p^:=' ';
|
||||
inc(p);
|
||||
end;
|
||||
end;
|
||||
LazUTF8.UTF8FixBroken(P);
|
||||
end;
|
||||
|
||||
function UTF8CharacterStrictLength(P: PChar): integer;
|
||||
begin
|
||||
if p=nil then exit(0);
|
||||
if ord(p^)<%10000000 then begin
|
||||
// regular single byte character
|
||||
exit(1);
|
||||
end
|
||||
else if ord(p^)<%11000000 then begin
|
||||
// invalid single byte character
|
||||
exit(0);
|
||||
end
|
||||
else if ((ord(p^) and %11100000) = %11000000) then begin
|
||||
// should be 2 byte character
|
||||
if (ord(p[1]) and %11000000) = %10000000 then
|
||||
exit(2)
|
||||
else
|
||||
exit(0);
|
||||
end
|
||||
else if ((ord(p^) and %11110000) = %11100000) then begin
|
||||
// should be 3 byte character
|
||||
if ((ord(p[1]) and %11000000) = %10000000)
|
||||
and ((ord(p[2]) and %11000000) = %10000000) then
|
||||
exit(3)
|
||||
else
|
||||
exit(0);
|
||||
end
|
||||
else if ((ord(p^) and %11111000) = %11110000) then begin
|
||||
// should be 4 byte character
|
||||
if ((ord(p[1]) and %11000000) = %10000000)
|
||||
and ((ord(p[2]) and %11000000) = %10000000)
|
||||
and ((ord(p[3]) and %11000000) = %10000000) then
|
||||
exit(4)
|
||||
else
|
||||
exit(0);
|
||||
end else
|
||||
exit(0);
|
||||
Result := LazUTF8.UTF8CharacterStrictLength(P);
|
||||
end;
|
||||
|
||||
function UTF8CStringToUTF8String(SourceStart: PChar; SourceLen: PtrInt) : string;
|
||||
var
|
||||
Source: PChar;
|
||||
Dest: PChar;
|
||||
SourceEnd: PChar;
|
||||
CharLen: integer;
|
||||
SourceCopied: PChar;
|
||||
|
||||
// Copies from SourceStart till Source to Dest and updates Dest
|
||||
procedure CopyPart; inline;
|
||||
var
|
||||
CopyLength: SizeInt;
|
||||
begin
|
||||
CopyLength := Source - SourceCopied;
|
||||
if CopyLength=0 then exit;
|
||||
System.move(SourceCopied^ , Dest^, CopyLength);
|
||||
SourceCopied:=Source;
|
||||
inc(Dest, CopyLength);
|
||||
end;
|
||||
|
||||
begin
|
||||
SetLength(Result, SourceLen);
|
||||
if SourceLen=0 then exit;
|
||||
SourceCopied:=SourceStart;
|
||||
Source:=SourceStart;
|
||||
Dest:=PChar(Result);
|
||||
SourceEnd := Source + SourceLen;
|
||||
while Source<SourceEnd do begin
|
||||
CharLen := UTF8CharacterLength(Source);
|
||||
if (CharLen=1) and (Source^='\') then begin
|
||||
CopyPart;
|
||||
inc(Source);
|
||||
if Source^ in ['t', 'n', '"', '\'] then begin
|
||||
case Source^ of
|
||||
't' : Dest^ := #9;
|
||||
'"' : Dest^ := '"';
|
||||
'\' : Dest^ := '\';
|
||||
'n' :
|
||||
// fpc 2.1.1 stores string constants as array of char so maybe this
|
||||
// will work for without ifdef (once available in 2.0.x too):
|
||||
// move(lineending, dest^, sizeof(LineEnding));
|
||||
{$IFDEF WINDOWS}
|
||||
begin
|
||||
move(lineending[1], dest^, length(LineEnding));
|
||||
inc(dest, length(LineEnding)-1);
|
||||
end;
|
||||
{$ELSE}
|
||||
Dest^ := LineEnding;
|
||||
{$ENDIF}
|
||||
end;
|
||||
inc(Source);
|
||||
inc(Dest);
|
||||
end;
|
||||
SourceCopied := Source;
|
||||
end
|
||||
else
|
||||
Inc(Source, CharLen);
|
||||
end;
|
||||
CopyPart;
|
||||
SetLength(Result, Dest - PChar(Result));
|
||||
Result := LazUTF8.UTF8CStringToUTF8String(SourceStart, SourceLen);
|
||||
end;
|
||||
|
||||
function UTF8Pos(const SearchForText, SearchInText: string): PtrInt;
|
||||
// returns the character index, where the SearchForText starts in SearchInText
|
||||
var
|
||||
p: LongInt;
|
||||
begin
|
||||
p:=System.Pos(SearchForText,SearchInText);
|
||||
if p>0 then
|
||||
Result:=UTF8Length(PChar(SearchInText),p-1)+1
|
||||
else
|
||||
Result:=0;
|
||||
Result := LazUTF8.UTF8Pos(SearchForText, SearchInText);
|
||||
end;
|
||||
|
||||
function UTF8Copy(const s: string; StartCharIndex, CharCount: PtrInt): string;
|
||||
// returns substring
|
||||
var
|
||||
StartBytePos: PChar;
|
||||
EndBytePos: PChar;
|
||||
MaxBytes: PtrInt;
|
||||
begin
|
||||
StartBytePos:=UTF8CharStart(PChar(s),length(s),StartCharIndex-1);
|
||||
if StartBytePos=nil then
|
||||
Result:=''
|
||||
else begin
|
||||
MaxBytes:=PtrInt(PChar(s)+length(s)-StartBytePos);
|
||||
EndBytePos:=UTF8CharStart(StartBytePos,MaxBytes,CharCount);
|
||||
if EndBytePos=nil then
|
||||
Result:=copy(s,StartBytePos-PChar(s)+1,MaxBytes)
|
||||
else
|
||||
Result:=copy(s,StartBytePos-PChar(s)+1,EndBytePos-StartBytePos);
|
||||
end;
|
||||
Result := LazUTF8.UTF8Copy(s, StartCharIndex, CharCount);
|
||||
end;
|
||||
|
||||
procedure UTF8Delete(var s: String; StartCharIndex, CharCount: PtrInt);
|
||||
var
|
||||
StartBytePos: PChar;
|
||||
EndBytePos: PChar;
|
||||
MaxBytes: PtrInt;
|
||||
begin
|
||||
StartBytePos:=UTF8CharStart(PChar(s),length(s),StartCharIndex-1);
|
||||
if StartBytePos <> nil then
|
||||
begin
|
||||
MaxBytes:=PtrInt(PChar(s)+length(s)-StartBytePos);
|
||||
EndBytePos:=UTF8CharStart(StartBytePos,MaxBytes,CharCount);
|
||||
if EndBytePos=nil then
|
||||
Delete(s,StartBytePos-PChar(s)+1,MaxBytes)
|
||||
else
|
||||
Delete(s,StartBytePos-PChar(s)+1,EndBytePos-StartBytePos);
|
||||
end;
|
||||
LazUTF8.UTF8Delete(s, StartCharIndex, CharCount);
|
||||
end;
|
||||
|
||||
procedure UTF8Insert(const source: String; var s: string; StartCharIndex: PtrInt);
|
||||
var
|
||||
StartBytePos: PChar;
|
||||
begin
|
||||
StartBytePos:=UTF8CharStart(PChar(s),length(s),StartCharIndex-1);
|
||||
if StartBytePos <> nil then
|
||||
Insert(source, s, StartBytePos-PChar(s)+1);
|
||||
LazUTF8.UTF8Insert(source, s, StartCharIndex);
|
||||
end;
|
||||
|
||||
function UTF8LowerCase(const s: String): String;
|
||||
@ -3079,110 +2750,18 @@ end;
|
||||
function FindInvalidUTF8Character(p: PChar; Count: PtrInt;
|
||||
StopOnNonASCII: Boolean): PtrInt;
|
||||
// return -1 if ok
|
||||
var
|
||||
CharLen: Integer;
|
||||
begin
|
||||
if (p<>nil) then begin
|
||||
Result:=0;
|
||||
while Result<Count do begin
|
||||
if ord(p^)<128 then begin
|
||||
// regular single byte ASCII character (#0 is a character, this is pascal ;)
|
||||
CharLen:=1;
|
||||
end
|
||||
else if ord(p^)<%11000000 then begin
|
||||
// regular single byte character
|
||||
if StopOnNonASCII then
|
||||
exit;
|
||||
CharLen:=1;
|
||||
end
|
||||
else if ((ord(p^) and %11100000) = %11000000) then begin
|
||||
// could be 2 byte character
|
||||
if (Result<Count-1) and ((ord(p[1]) and %11000000) = %10000000) then
|
||||
CharLen:=2
|
||||
else
|
||||
exit; // missing following bytes
|
||||
end
|
||||
else if ((ord(p^) and %11110000) = %11100000) then begin
|
||||
// could be 3 byte character
|
||||
if (Result<Count-2) and ((ord(p[1]) and %11000000) = %10000000)
|
||||
and ((ord(p[2]) and %11000000) = %10000000) then
|
||||
CharLen:=3
|
||||
else
|
||||
exit; // missing following bytes
|
||||
end
|
||||
else if ((ord(p^) and %11111000) = %11110000) then begin
|
||||
// could be 4 byte character
|
||||
if (Result<Count-3) and ((ord(p[1]) and %11000000) = %10000000)
|
||||
and ((ord(p[2]) and %11000000) = %10000000)
|
||||
and ((ord(p[3]) and %11000000) = %10000000) then
|
||||
CharLen:=4
|
||||
else
|
||||
exit; // missing following bytes
|
||||
end
|
||||
else begin
|
||||
if StopOnNonASCII then
|
||||
exit;
|
||||
CharLen:=1;
|
||||
end;
|
||||
inc(Result,CharLen);
|
||||
inc(p,CharLen);
|
||||
if Result>Count then begin
|
||||
dec(Result,CharLen);
|
||||
exit; // missing following bytes
|
||||
end;
|
||||
end;
|
||||
end;
|
||||
// ok
|
||||
Result:=-1;
|
||||
Result := LazUTF8.FindInvalidUTF8Character(p, Count, StopOnNonASCII);
|
||||
end;
|
||||
|
||||
function ValidUTF8String(const s: String): String;
|
||||
var
|
||||
p, cur: PChar;
|
||||
l, lr: integer;
|
||||
NeedFree: Boolean;
|
||||
begin
|
||||
if FindInvalidUTF8Character(PChar(s), Length(s)) <> -1 then
|
||||
begin
|
||||
NeedFree := True;
|
||||
GetMem(p, Length(s) + 1);
|
||||
StrPCopy(p, s);
|
||||
UTF8FixBroken(p);
|
||||
end
|
||||
else
|
||||
begin
|
||||
p := PChar(s);
|
||||
NeedFree := False;
|
||||
end;
|
||||
|
||||
Result := '';
|
||||
cur := p;
|
||||
while cur^ <> #0 do
|
||||
begin
|
||||
l := UTF8CharacterLength(cur);
|
||||
if (l = 1) and (cur^ < #32) then
|
||||
Result := Result + '#' + IntToStr(Ord(cur^))
|
||||
else
|
||||
begin
|
||||
lr := Length(Result);
|
||||
SetLength(Result, lr + l);
|
||||
System.Move(cur^, Result[lr + 1], l);
|
||||
end;
|
||||
inc(cur, l)
|
||||
end;
|
||||
|
||||
if NeedFree then
|
||||
FreeMem(p);
|
||||
Result := LazUTF8.ValidUTF8String(s);
|
||||
end;
|
||||
|
||||
procedure AssignUTF8ListToAnsi(UTF8List, AnsiList: TStrings);
|
||||
var
|
||||
i: Integer;
|
||||
begin
|
||||
AnsiList.Clear;
|
||||
if UTF8List=nil then exit;
|
||||
for i:=0 to UTF8List.Count-1 do
|
||||
AnsiList.Add(UTF8ToSys(UTF8List[i]));
|
||||
LazUTF8.AssignUTF8ListToAnsi(UTF8List, AnsiList);
|
||||
end;
|
||||
|
||||
function UTF16CharacterLength(p: PWideChar): integer;
|
||||
@ -3324,168 +2903,9 @@ end;
|
||||
function ConvertUTF8ToUTF16(Dest: PWideChar; DestWideCharCount: SizeUInt;
|
||||
Src: PChar; SrcCharCount: SizeUInt; Options: TConvertOptions;
|
||||
out ActualWideCharCount: SizeUInt): TConvertResult;
|
||||
var
|
||||
DestI, SrcI: SizeUInt;
|
||||
B1, B2, B3, B4: Byte;
|
||||
W: Word;
|
||||
C: Cardinal;
|
||||
|
||||
function UnfinishedCharError: Boolean;
|
||||
begin
|
||||
if toUnfinishedCharToSymbol in Options then
|
||||
begin
|
||||
Dest[DestI] := System.WideChar('?');
|
||||
Inc(DestI);
|
||||
Result := False;
|
||||
end
|
||||
else
|
||||
if toUnfinishedCharError in Options then
|
||||
begin
|
||||
ConvertUTF8ToUTF16 := trUnfinishedChar;
|
||||
Result := True;
|
||||
end
|
||||
else Result := False;
|
||||
end;
|
||||
|
||||
function InvalidCharError(Count: SizeUInt): Boolean; inline;
|
||||
begin
|
||||
if not (toInvalidCharError in Options) then
|
||||
begin
|
||||
if toInvalidCharToSymbol in Options then
|
||||
begin
|
||||
Dest[DestI] := System.WideChar('?');
|
||||
Inc(DestI);
|
||||
end;
|
||||
|
||||
Dec(SrcI, Count);
|
||||
|
||||
// skip trailing UTF-8 char bytes
|
||||
while (Count > 0) do
|
||||
begin
|
||||
if (Byte(Src[SrcI]) and %11000000) <> %10000000 then Break;
|
||||
Inc(SrcI);
|
||||
Dec(Count);
|
||||
end;
|
||||
|
||||
Result := False;
|
||||
end
|
||||
else
|
||||
if toInvalidCharError in Options then
|
||||
begin
|
||||
ConvertUTF8ToUTF16 := trUnfinishedChar;
|
||||
Result := True;
|
||||
end;
|
||||
end;
|
||||
|
||||
begin
|
||||
ActualWideCharCount := 0;
|
||||
|
||||
if not Assigned(Src) then
|
||||
begin
|
||||
Result := trNullSrc;
|
||||
Exit;
|
||||
end;
|
||||
|
||||
if not Assigned(Dest) then
|
||||
begin
|
||||
Result := trNullDest;
|
||||
Exit;
|
||||
end;
|
||||
SrcI := 0;
|
||||
DestI := 0;
|
||||
|
||||
while (DestI < DestWideCharCount) and (SrcI < SrcCharCount) do
|
||||
begin
|
||||
B1 := Byte(Src[SrcI]);
|
||||
Inc(SrcI);
|
||||
|
||||
if B1 < 128 then // single byte UTF-8 char
|
||||
begin
|
||||
Dest[DestI] := System.WideChar(B1);
|
||||
Inc(DestI);
|
||||
end
|
||||
else
|
||||
begin
|
||||
if SrcI >= SrcCharCount then
|
||||
if UnfinishedCharError then Exit(trInvalidChar)
|
||||
else Break;
|
||||
|
||||
B2 := Byte(Src[SrcI]);
|
||||
Inc(SrcI);
|
||||
|
||||
if (B1 and %11100000) = %11000000 then // double byte UTF-8 char
|
||||
begin
|
||||
if (B2 and %11000000) = %10000000 then
|
||||
begin
|
||||
Dest[DestI] := System.WideChar(((B1 and %00011111) shl 6) or (B2 and %00111111));
|
||||
Inc(DestI);
|
||||
end
|
||||
else // invalid character, assume single byte UTF-8 char
|
||||
if InvalidCharError(1) then Exit(trInvalidChar);
|
||||
end
|
||||
else
|
||||
begin
|
||||
if SrcI >= SrcCharCount then
|
||||
if UnfinishedCharError then Exit(trInvalidChar)
|
||||
else Break;
|
||||
|
||||
B3 := Byte(Src[SrcI]);
|
||||
Inc(SrcI);
|
||||
|
||||
if (B1 and %11110000) = %11100000 then // triple byte UTF-8 char
|
||||
begin
|
||||
if ((B2 and %11000000) = %10000000) and ((B3 and %11000000) = %10000000) then
|
||||
begin
|
||||
W := ((B1 and %00011111) shl 12) or ((B2 and %00111111) shl 6) or (B3 and %00111111);
|
||||
if (W < $D800) or (W > $DFFF) then // to single wide char UTF-16 char
|
||||
begin
|
||||
Dest[DestI] := System.WideChar(W);
|
||||
Inc(DestI);
|
||||
end
|
||||
else // invalid UTF-16 character, assume double byte UTF-8 char
|
||||
if InvalidCharError(2) then Exit(trInvalidChar);
|
||||
end
|
||||
else // invalid character, assume double byte UTF-8 char
|
||||
if InvalidCharError(2) then Exit(trInvalidChar);
|
||||
end
|
||||
else
|
||||
begin
|
||||
if SrcI >= SrcCharCount then
|
||||
if UnfinishedCharError then Exit(trInvalidChar)
|
||||
else Break;
|
||||
|
||||
B4 := Byte(Src[SrcI]);
|
||||
Inc(SrcI);
|
||||
|
||||
if ((B1 and %11111000) = %11110000) and ((B2 and %11000000) = %10000000)
|
||||
and ((B3 and %11000000) = %10000000) and ((B4 and %11000000) = %10000000) then
|
||||
begin // 4 byte UTF-8 char
|
||||
C := ((B1 and %00011111) shl 18) or ((B2 and %00111111) shl 12)
|
||||
or ((B3 and %00111111) shl 6) or (B4 and %00111111);
|
||||
// to double wide char UTF-16 char
|
||||
Dest[DestI] := System.WideChar($D800 or ((C - $10000) shr 10));
|
||||
Inc(DestI);
|
||||
if DestI >= DestWideCharCount then Break;
|
||||
Dest[DestI] := System.WideChar($DC00 or ((C - $10000) and %0000001111111111));
|
||||
Inc(DestI);
|
||||
end
|
||||
else // invalid character, assume triple byte UTF-8 char
|
||||
if InvalidCharError(3) then Exit(trInvalidChar);
|
||||
end;
|
||||
end;
|
||||
end;
|
||||
end;
|
||||
|
||||
if DestI >= DestWideCharCount then
|
||||
begin
|
||||
DestI := DestWideCharCount - 1;
|
||||
Result := trDestExhausted;
|
||||
end
|
||||
else
|
||||
Result := trNoError;
|
||||
|
||||
Dest[DestI] := #0;
|
||||
ActualWideCharCount := DestI + 1;
|
||||
Result := LazUTF8.ConvertUTF8ToUTF16(Dest, DestWideCharCount,
|
||||
Src, SrcCharCount, Options, ActualWideCharCount);
|
||||
end;
|
||||
|
||||
{------------------------------------------------------------------------------
|
||||
@ -3507,148 +2927,9 @@ end;
|
||||
function ConvertUTF16ToUTF8(Dest: PChar; DestCharCount: SizeUInt;
|
||||
Src: PWideChar; SrcWideCharCount: SizeUInt; Options: TConvertOptions;
|
||||
out ActualCharCount: SizeUInt): TConvertResult;
|
||||
var
|
||||
DestI, SrcI: SizeUInt;
|
||||
W1, W2: Word;
|
||||
C: Cardinal;
|
||||
|
||||
function UnfinishedCharError: Boolean;
|
||||
begin
|
||||
if toUnfinishedCharToSymbol in Options then
|
||||
begin
|
||||
Dest[DestI] := Char('?');
|
||||
Inc(DestI);
|
||||
Result := False;
|
||||
end
|
||||
else
|
||||
if toUnfinishedCharError in Options then
|
||||
begin
|
||||
ConvertUTF16ToUTF8 := trUnfinishedChar;
|
||||
Result := True;
|
||||
end
|
||||
else Result := False;
|
||||
end;
|
||||
|
||||
function InvalidCharError(Count: SizeUInt): Boolean; inline;
|
||||
begin
|
||||
if not (toInvalidCharError in Options) then
|
||||
begin
|
||||
if toInvalidCharToSymbol in Options then
|
||||
begin
|
||||
Dest[DestI] := Char('?');
|
||||
Inc(DestI);
|
||||
end;
|
||||
|
||||
Dec(SrcI, Count);
|
||||
// skip trailing UTF-16 wide char
|
||||
if (Word(Src[SrcI]) and $FC00) = $DC00 then Inc(SrcI);
|
||||
|
||||
Result := False;
|
||||
end
|
||||
else
|
||||
if toInvalidCharError in Options then
|
||||
begin
|
||||
ConvertUTF16ToUTF8 := trUnfinishedChar;
|
||||
Result := True;
|
||||
end;
|
||||
end;
|
||||
|
||||
begin
|
||||
ActualCharCount := 0;
|
||||
|
||||
if not Assigned(Src) then
|
||||
begin
|
||||
Result := trNullSrc;
|
||||
Exit;
|
||||
end;
|
||||
|
||||
if not Assigned(Dest) then
|
||||
begin
|
||||
Result := trNullDest;
|
||||
Exit;
|
||||
end;
|
||||
SrcI := 0;
|
||||
DestI := 0;
|
||||
|
||||
while (DestI < DestCharCount) and (SrcI < SrcWideCharCount) do
|
||||
begin
|
||||
W1 := Word(Src[SrcI]);
|
||||
Inc(SrcI);
|
||||
|
||||
if (W1 < $D800) or (W1 > $DFFF) then // single wide char UTF-16 char
|
||||
begin
|
||||
if W1 < $0080 then // to single byte UTF-8 char
|
||||
begin
|
||||
Dest[DestI] := Char(W1);
|
||||
Inc(DestI);
|
||||
end
|
||||
else
|
||||
if W1 < $0800 then // to double byte UTF-8 char
|
||||
begin
|
||||
Dest[DestI] := Char(%11000000 or ((W1 and %11111000000) shr 6));
|
||||
Inc(DestI);
|
||||
if DestI >= DestCharCount then Break;
|
||||
Dest[DestI] := Char(%10000000 or (W1 and %111111));
|
||||
Inc(DestI);
|
||||
end
|
||||
else
|
||||
begin // to triple byte UTF-8 char
|
||||
Dest[DestI] := Char(%11100000 or ((W1 and %1111000000000000) shr 12));
|
||||
Inc(DestI);
|
||||
if DestI >= DestCharCount then Break;
|
||||
Dest[DestI] := Char(%10000000 or ((W1 and %111111000000) shr 6));
|
||||
Inc(DestI);
|
||||
if DestI >= DestCharCount then Break;
|
||||
Dest[DestI] := Char(%10000000 or (W1 and %111111));
|
||||
Inc(DestI);
|
||||
end;
|
||||
end
|
||||
else
|
||||
begin
|
||||
if SrcI >= SrcWideCharCount then
|
||||
if UnfinishedCharError then Exit(trInvalidChar)
|
||||
else Break;
|
||||
|
||||
W2 := Word(Src[SrcI]);
|
||||
Inc(SrcI);
|
||||
|
||||
if (W1 and $F800) = $D800 then // double wide char UTF-16 char
|
||||
begin
|
||||
if (W2 and $FC00) = $DC00 then
|
||||
begin
|
||||
C := (W1 - $D800) shl 10 + (W2 - $DC00) + $10000;
|
||||
|
||||
// to 4 byte UTF-8 char
|
||||
Dest[DestI] := Char(%11110000 or (C shr 18));
|
||||
Inc(DestI);
|
||||
if DestI >= DestCharCount then Break;
|
||||
Dest[DestI] := Char(%10000000 or ((C and $3F000) shr 12));
|
||||
Inc(DestI);
|
||||
if DestI >= DestCharCount then Break;
|
||||
Dest[DestI] := Char(%10000000 or ((C and %111111000000) shr 6));
|
||||
Inc(DestI);
|
||||
if DestI >= DestCharCount then Break;
|
||||
Dest[DestI] := Char(%10000000 or (C and %111111));
|
||||
Inc(DestI);
|
||||
end
|
||||
else // invalid character, assume single wide char UTF-16 char
|
||||
if InvalidCharError(1) then Exit(trInvalidChar);
|
||||
end
|
||||
else // invalid character, assume single wide char UTF-16 char
|
||||
if InvalidCharError(1) then Exit(trInvalidChar);
|
||||
end;
|
||||
end;
|
||||
|
||||
if DestI >= DestCharCount then
|
||||
begin
|
||||
DestI := DestCharCount - 1;
|
||||
Result := trDestExhausted;
|
||||
end
|
||||
else
|
||||
Result := trNoError;
|
||||
|
||||
Dest[DestI] := #0;
|
||||
ActualCharCount := DestI + 1;
|
||||
Result := LazUTF8.ConvertUTF16ToUTF8(Dest, DestCharCount,
|
||||
Src, SrcWideCharCount, Options, ActualCharCount);
|
||||
end;
|
||||
|
||||
{------------------------------------------------------------------------------
|
||||
@ -3661,21 +2942,8 @@ end;
|
||||
copy
|
||||
------------------------------------------------------------------------------}
|
||||
function UTF8ToUTF16(const S: AnsiString): UTF16String;
|
||||
var
|
||||
L: SizeUInt;
|
||||
begin
|
||||
if S = ''
|
||||
then begin
|
||||
Result := '';
|
||||
Exit;
|
||||
end;
|
||||
|
||||
SetLength(Result, Length(S));
|
||||
// wide chars of UTF-16 <= bytes of UTF-8 string
|
||||
if ConvertUTF8ToUTF16(PWideChar(Result), Length(Result) + 1, PChar(S), Length(S),
|
||||
[toInvalidCharToSymbol], L) = trNoError
|
||||
then SetLength(Result, L - 1)
|
||||
else Result := '';
|
||||
Result := LazUTF8.UTF8ToUTF16(S);
|
||||
end;
|
||||
|
||||
{------------------------------------------------------------------------------
|
||||
@ -3686,74 +2954,13 @@ end;
|
||||
Converts the specified UTF-16 encoded string (system endian) to UTF-8 encoded
|
||||
------------------------------------------------------------------------------}
|
||||
function UTF16ToUTF8(const S: UTF16String): AnsiString;
|
||||
var
|
||||
L: SizeUInt;
|
||||
R: AnsiString;
|
||||
begin
|
||||
Result := '';
|
||||
if S = '' then Exit;
|
||||
|
||||
SetLength(R, Length(S) * 3);
|
||||
// bytes of UTF-8 <= 3 * wide chars of UTF-16 string
|
||||
// e.g. %11100000 10100000 10000000 (UTF-8) is $0800 (UTF-16)
|
||||
if ConvertUTF16ToUTF8(PChar(R), Length(R) + 1, PWideChar(S), Length(S),
|
||||
[toInvalidCharToSymbol], L) = trNoError then
|
||||
begin
|
||||
SetLength(R, L - 1);
|
||||
Result := R;
|
||||
end;
|
||||
Result := LazUTF8.UTF16ToUTF8(S);
|
||||
end;
|
||||
|
||||
procedure LCLGetLanguageIDs(var Lang, FallbackLang: String);
|
||||
|
||||
{$IFDEF DARWIN}
|
||||
function GetLanguage: boolean;
|
||||
var
|
||||
Ref: CFStringRef;
|
||||
LangArray: CFMutableArrayRef;
|
||||
StrSize: CFIndex;
|
||||
StrRange: CFRange;
|
||||
Locals: CFArrayRef;
|
||||
Bundle: CFBundleRef;
|
||||
begin
|
||||
Result := false;
|
||||
Bundle:=CFBundleGetMainBundle;
|
||||
if Bundle=nil then exit;
|
||||
Locals:=CFBundleCopyBundleLocalizations(Bundle);
|
||||
if Locals=nil then exit;
|
||||
LangArray := CFBundleCopyLocalizationsForPreferences(Locals, nil);
|
||||
try
|
||||
if CFArrayGetCount(LangArray) > 0 then
|
||||
begin
|
||||
Ref := CFArrayGetValueAtIndex(LangArray, 0);
|
||||
StrRange.location := 0;
|
||||
StrRange.length := CFStringGetLength(Ref);
|
||||
|
||||
CFStringGetBytes(Ref, StrRange, kCFStringEncodingUTF8,
|
||||
Ord('?'), False, nil, 0, StrSize);
|
||||
SetLength(Lang, StrSize);
|
||||
|
||||
if StrSize > 0 then
|
||||
begin
|
||||
CFStringGetBytes(Ref, StrRange, kCFStringEncodingUTF8,
|
||||
Ord('?'), False, @Lang[1], StrSize, StrSize);
|
||||
Result:=true;
|
||||
FallbackLang := Copy(Lang, 1, 2);
|
||||
end;
|
||||
end;
|
||||
finally
|
||||
CFRelease(LangArray);
|
||||
CFRelease(Locals);
|
||||
end;
|
||||
end;
|
||||
{$ENDIF}
|
||||
begin
|
||||
{$IFDEF DARWIN}
|
||||
if not GetLanguage then
|
||||
GetLanguageIDs(Lang, FallbackLang);
|
||||
{$ELSE}
|
||||
GetLanguageIDs(Lang, FallbackLang);
|
||||
{$ENDIF}
|
||||
LazUTF8.LazGetLanguageIDs(Lang, FallbackLang);
|
||||
end;
|
||||
|
||||
function CreateFirstIdentifier(const Identifier: string): string;
|
||||
|
Loading…
Reference in New Issue
Block a user