mirror of
https://gitlab.com/freepascal.org/lazarus/lazarus.git
synced 2025-08-22 18:19:23 +02:00
First steps of a major move of code from lclproc to lazutf8
git-svn-id: trunk@32986 -
This commit is contained in:
parent
207d40aab6
commit
b3f587b60f
@ -34,17 +34,90 @@ interface
|
||||
uses
|
||||
Classes, SysUtils, lazutf8;
|
||||
|
||||
//function UTF16CharacterLength(p: PWideChar): integer;
|
||||
//function UTF16Length(const s: widestring): PtrInt;
|
||||
//function UTF16Length(p: PWideChar; WordCount: PtrInt): PtrInt;
|
||||
//function UTF16CharacterToUnicode(p: PWideChar; out CharLen: integer): Cardinal;
|
||||
//function UnicodeToUTF16(u: cardinal): widestring;
|
||||
function UTF16CharacterLength(p: PWideChar): integer;
|
||||
function UTF16Length(const s: widestring): PtrInt;
|
||||
function UTF16Length(p: PWideChar; WordCount: PtrInt): PtrInt;
|
||||
function UTF16CharacterToUnicode(p: PWideChar; out CharLen: integer): Cardinal;
|
||||
function UnicodeToUTF16(u: cardinal): widestring;
|
||||
|
||||
function UnicodeLowercase(u: cardinal): cardinal;
|
||||
function UTF8LowerCaseViaTables(const s: utf8string): utf8string;
|
||||
|
||||
implementation
|
||||
|
||||
function UTF16CharacterLength(p: PWideChar): integer;
|
||||
// returns length of UTF16 character in number of words
|
||||
// The endianess of the machine will be taken.
|
||||
begin
|
||||
if p<>nil then begin
|
||||
if (ord(p[0]) < $D800) or (ord(p[0]) > $DFFF) then
|
||||
Result:=1
|
||||
else
|
||||
Result:=2;
|
||||
end else begin
|
||||
Result:=0;
|
||||
end;
|
||||
end;
|
||||
|
||||
function UTF16Length(const s: widestring): PtrInt;
|
||||
begin
|
||||
Result:=UTF16Length(PWideChar(s),length(s));
|
||||
end;
|
||||
|
||||
function UTF16Length(p: PWideChar; WordCount: PtrInt): PtrInt;
|
||||
var
|
||||
CharLen: LongInt;
|
||||
begin
|
||||
Result:=0;
|
||||
while (WordCount>0) do begin
|
||||
inc(Result);
|
||||
CharLen:=UTF16CharacterLength(p);
|
||||
inc(p,CharLen);
|
||||
dec(WordCount,CharLen);
|
||||
end;
|
||||
end;
|
||||
|
||||
function UTF16CharacterToUnicode(p: PWideChar; out CharLen: integer): Cardinal;
|
||||
var
|
||||
w1: cardinal;
|
||||
w2: Cardinal;
|
||||
begin
|
||||
if p<>nil then begin
|
||||
w1:=ord(p[0]);
|
||||
if (w1 < $D800) or (w1 > $DFFF) then begin
|
||||
// is 1 word character
|
||||
Result:=w1;
|
||||
CharLen:=1;
|
||||
end else begin
|
||||
// could be 2 word character
|
||||
w2:=ord(p[1]);
|
||||
if (w2>=$DC00) then begin
|
||||
// is 2 word character
|
||||
Result:=(w1-$D800) shl 10 + (w2-$DC00) + $10000;
|
||||
CharLen:=2;
|
||||
end else begin
|
||||
// invalid character
|
||||
Result:=w1;
|
||||
CharLen:=1;
|
||||
end;
|
||||
end;
|
||||
end else begin
|
||||
Result:=0;
|
||||
CharLen:=0;
|
||||
end;
|
||||
end;
|
||||
|
||||
function UnicodeToUTF16(u: cardinal): widestring;
|
||||
begin
|
||||
// u should be <= $10FFFF to fit into UTF-16
|
||||
|
||||
if u < $10000 then
|
||||
// Note: codepoints $D800 - $DFFF are reserved
|
||||
Result:=system.widechar(u)
|
||||
else
|
||||
Result:=system.widechar($D800+((u - $10000) shr 10))+system.widechar($DC00+((u - $10000) and $3ff));
|
||||
end;
|
||||
|
||||
// Lowercase Unicode Tables which match UTF-16 but also UTF-32
|
||||
var
|
||||
UnicodeLower00C0_00DE: array[$00C0..$00DE] of word;
|
||||
|
@ -45,7 +45,7 @@ function UTF8CharacterToUnicode(p: PChar; out CharLen: integer): Cardinal;
|
||||
function UnicodeToUTF8(u: cardinal; Buf: PChar): integer; inline;
|
||||
function UnicodeToUTF8SkipErrors(u: cardinal; Buf: PChar): integer;
|
||||
function UnicodeToUTF8(u: cardinal): shortstring; inline;
|
||||
{function UTF8ToDoubleByteString(const s: string): string;
|
||||
function UTF8ToDoubleByteString(const s: string): string;
|
||||
function UTF8ToDoubleByte(UTF8Str: PChar; Len: PtrInt; DBStr: PByte): PtrInt;
|
||||
function UTF8FindNearestCharStart(UTF8Str: PChar; Len: integer;
|
||||
BytePos: integer): integer;
|
||||
@ -53,23 +53,21 @@ function UTF8FindNearestCharStart(UTF8Str: PChar; Len: integer;
|
||||
function UTF8CharStart(UTF8Str: PChar; Len, CharIndex: PtrInt): PChar;
|
||||
// find the byte index of the n-th UTF8 character, ignoring BIDI (byte len of substr)
|
||||
function UTF8CharToByteIndex(UTF8Str: PChar; Len, CharIndex: PtrInt): PtrInt;
|
||||
procedure UTF8FixBroken(P: PChar);}
|
||||
procedure UTF8FixBroken(P: PChar);
|
||||
function UTF8CharacterStrictLength(P: PChar): integer;
|
||||
{function UTF8CStringToUTF8String(SourceStart: PChar; SourceLen: PtrInt) : string;
|
||||
function UTF8CStringToUTF8String(SourceStart: PChar; SourceLen: PtrInt) : string;
|
||||
function UTF8Pos(const SearchForText, SearchInText: string): PtrInt;
|
||||
function UTF8Copy(const s: string; StartCharIndex, CharCount: PtrInt): string;
|
||||
procedure UTF8Delete(var s: String; StartCharIndex, CharCount: PtrInt);
|
||||
procedure UTF8Insert(const source: String; var s: string; StartCharIndex: PtrInt);}
|
||||
procedure UTF8Insert(const source: String; var s: string; StartCharIndex: PtrInt);
|
||||
|
||||
function UTF8LowerCase(const AInStr: utf8string; ALanguage: utf8string=''): utf8string;
|
||||
function UTF8UpperCase(const AInStr: utf8string; ALanguage: utf8string=''): utf8string;
|
||||
{function FindInvalidUTF8Character(p: PChar; Count: PtrInt;
|
||||
// StopOnNonASCII: Boolean = false): PtrInt;
|
||||
//function ValidUTF8String(const s: String): String;
|
||||
function FindInvalidUTF8Character(p: PChar; Count: PtrInt;
|
||||
StopOnNonASCII: Boolean = false): PtrInt;
|
||||
function ValidUTF8String(const s: String): String;
|
||||
|
||||
//procedure AssignUTF8ListToAnsi(UTF8List, AnsiList: TStrings);
|
||||
|
||||
}
|
||||
procedure AssignUTF8ListToAnsi(UTF8List, AnsiList: TStrings);
|
||||
|
||||
//compare functions
|
||||
|
||||
@ -347,6 +345,156 @@ begin
|
||||
Result[0]:=chr(UnicodeToUTF8(u,@Result[1]));
|
||||
end;
|
||||
|
||||
function UTF8ToDoubleByteString(const s: string): string;
|
||||
var
|
||||
Len: Integer;
|
||||
begin
|
||||
Len:=UTF8Length(s);
|
||||
SetLength(Result,Len*2);
|
||||
if Len=0 then exit;
|
||||
UTF8ToDoubleByte(PChar(s),length(s),PByte(Result));
|
||||
end;
|
||||
|
||||
{ returns number of double bytes }
|
||||
function UTF8ToDoubleByte(UTF8Str: PChar; Len: PtrInt; DBStr: PByte): PtrInt;
|
||||
var
|
||||
SrcPos: PChar;
|
||||
CharLen: LongInt;
|
||||
DestPos: PByte;
|
||||
u: Cardinal;
|
||||
begin
|
||||
SrcPos:=UTF8Str;
|
||||
DestPos:=DBStr;
|
||||
Result:=0;
|
||||
while Len>0 do begin
|
||||
u:=UTF8CharacterToUnicode(SrcPos,CharLen);
|
||||
DestPos^:=byte((u shr 8) and $ff);
|
||||
inc(DestPos);
|
||||
DestPos^:=byte(u and $ff);
|
||||
inc(DestPos);
|
||||
inc(SrcPos,CharLen);
|
||||
dec(Len,CharLen);
|
||||
inc(Result);
|
||||
end;
|
||||
end;
|
||||
|
||||
{ Find the start of the UTF8 character which contains BytePos,
|
||||
Len is length in byte, BytePos starts at 0 }
|
||||
function UTF8FindNearestCharStart(UTF8Str: PChar; Len: integer;
|
||||
BytePos: integer): integer;
|
||||
begin
|
||||
Result:=0;
|
||||
if (UTF8Str<>nil) and (Len>0) and (BytePos>=0) then begin
|
||||
Result:=BytePos;
|
||||
if Result>Len then Result:=Len-1;
|
||||
if (Result>0) and (ord(UTF8Str[Result]) and %11000000=%10000000) then begin
|
||||
dec(Result);
|
||||
if (Result>0) and (ord(UTF8Str[Result]) and %11000000=%10000000) then begin
|
||||
dec(Result);
|
||||
if (Result>0) and (ord(UTF8Str[Result]) and %11000000=%10000000) then begin
|
||||
dec(Result);
|
||||
// should be four byte character
|
||||
if (ord(UTF8Str[Result]) and %11111000<>%11110000) then begin
|
||||
// broken UTF8 character
|
||||
inc(Result,3);
|
||||
end else begin
|
||||
// is four byte character
|
||||
end;
|
||||
end else if (ord(UTF8Str[Result]) and %11110000<>%11100000) then begin
|
||||
// broken UTF8 character, should be three byte
|
||||
inc(Result,2);
|
||||
end else
|
||||
begin
|
||||
// is three byte character
|
||||
end;
|
||||
end else if (ord(UTF8Str[Result]) and %11100000<>%11000000) then begin
|
||||
// broken UTF8 character, should be two byte
|
||||
inc(Result);
|
||||
end else
|
||||
begin
|
||||
// is two byte character
|
||||
end;
|
||||
end;
|
||||
end;
|
||||
end;
|
||||
|
||||
{ Len is the length in bytes of UTF8Str
|
||||
CharIndex is the position of the desired char (starting at 0), in chars
|
||||
|
||||
This function is similar to UTF8FindNearestCharStart
|
||||
}
|
||||
function UTF8CharStart(UTF8Str: PChar; Len, CharIndex: PtrInt): PChar;
|
||||
var
|
||||
CharLen: LongInt;
|
||||
begin
|
||||
Result:=UTF8Str;
|
||||
if Result<>nil then begin
|
||||
while (CharIndex>0) and (Len>0) do begin
|
||||
CharLen:=UTF8CharacterLength(Result);
|
||||
dec(Len,CharLen);
|
||||
dec(CharIndex);
|
||||
inc(Result,CharLen);
|
||||
end;
|
||||
if (CharIndex>0) or (Len<0) then
|
||||
Result:=nil;
|
||||
end;
|
||||
end;
|
||||
|
||||
function UTF8CharToByteIndex(UTF8Str: PChar; Len, CharIndex: PtrInt): PtrInt;
|
||||
var
|
||||
p: PChar;
|
||||
begin
|
||||
p := UTF8CharStart(UTF8Str, Len, CharIndex);
|
||||
if p = nil
|
||||
then Result := -1
|
||||
else Result := p - UTF8Str;
|
||||
end;
|
||||
|
||||
{ fix any broken UTF8 sequences with spaces }
|
||||
procedure UTF8FixBroken(P: PChar);
|
||||
begin
|
||||
if p=nil then exit;
|
||||
while p^<>#0 do begin
|
||||
if ord(p^)<%10000000 then begin
|
||||
// regular single byte character
|
||||
inc(p);
|
||||
end
|
||||
else if ord(p^)<%11000000 then begin
|
||||
// invalid
|
||||
p^:=' ';
|
||||
inc(p);
|
||||
end
|
||||
else if ((ord(p^) and %11100000) = %11000000) then begin
|
||||
// should be 2 byte character
|
||||
if (ord(p[1]) and %11000000) = %10000000 then
|
||||
inc(p,2)
|
||||
else if p[1]<>#0 then
|
||||
p^:=' ';
|
||||
end
|
||||
else if ((ord(p^) and %11110000) = %11100000) then begin
|
||||
// should be 3 byte character
|
||||
if ((ord(p[1]) and %11000000) = %10000000)
|
||||
and ((ord(p[2]) and %11000000) = %10000000) then
|
||||
inc(p,3)
|
||||
else
|
||||
p^:=' ';
|
||||
end
|
||||
else if ((ord(p^) and %11111000) = %11110000) then begin
|
||||
// should be 4 byte character
|
||||
if ((ord(p[1]) and %11000000) = %10000000)
|
||||
and ((ord(p[2]) and %11000000) = %10000000)
|
||||
and ((ord(p[3]) and %11000000) = %10000000) then
|
||||
inc(p,4)
|
||||
else
|
||||
p^:=' ';
|
||||
end
|
||||
else begin
|
||||
p^:=' ';
|
||||
inc(p);
|
||||
end;
|
||||
end;
|
||||
end;
|
||||
|
||||
function UTF8CharacterStrictLength(P: PChar): integer;
|
||||
begin
|
||||
if p=nil then exit(0);
|
||||
@ -385,6 +533,127 @@ begin
|
||||
exit(0);
|
||||
end;
|
||||
|
||||
function UTF8CStringToUTF8String(SourceStart: PChar; SourceLen: PtrInt) : string;
|
||||
var
|
||||
Source: PChar;
|
||||
Dest: PChar;
|
||||
SourceEnd: PChar;
|
||||
CharLen: integer;
|
||||
SourceCopied: PChar;
|
||||
|
||||
// Copies from SourceStart till Source to Dest and updates Dest
|
||||
procedure CopyPart; inline;
|
||||
var
|
||||
CopyLength: SizeInt;
|
||||
begin
|
||||
CopyLength := Source - SourceCopied;
|
||||
if CopyLength=0 then exit;
|
||||
System.move(SourceCopied^ , Dest^, CopyLength);
|
||||
SourceCopied:=Source;
|
||||
inc(Dest, CopyLength);
|
||||
end;
|
||||
|
||||
begin
|
||||
SetLength(Result, SourceLen);
|
||||
if SourceLen=0 then exit;
|
||||
SourceCopied:=SourceStart;
|
||||
Source:=SourceStart;
|
||||
Dest:=PChar(Result);
|
||||
SourceEnd := Source + SourceLen;
|
||||
while Source<SourceEnd do begin
|
||||
CharLen := UTF8CharacterLength(Source);
|
||||
if (CharLen=1) and (Source^='\') then begin
|
||||
CopyPart;
|
||||
inc(Source);
|
||||
if Source^ in ['t', 'n', '"', '\'] then begin
|
||||
case Source^ of
|
||||
't' : Dest^ := #9;
|
||||
'"' : Dest^ := '"';
|
||||
'\' : Dest^ := '\';
|
||||
'n' :
|
||||
// fpc 2.1.1 stores string constants as array of char so maybe this
|
||||
// will work for without ifdef (once available in 2.0.x too):
|
||||
// move(lineending, dest^, sizeof(LineEnding));
|
||||
{$IFDEF WINDOWS}
|
||||
begin
|
||||
move(lineending[1], dest^, length(LineEnding));
|
||||
inc(dest, length(LineEnding)-1);
|
||||
end;
|
||||
{$ELSE}
|
||||
Dest^ := LineEnding;
|
||||
{$ENDIF}
|
||||
end;
|
||||
inc(Source);
|
||||
inc(Dest);
|
||||
end;
|
||||
SourceCopied := Source;
|
||||
end
|
||||
else
|
||||
Inc(Source, CharLen);
|
||||
end;
|
||||
CopyPart;
|
||||
SetLength(Result, Dest - PChar(Result));
|
||||
end;
|
||||
|
||||
function UTF8Pos(const SearchForText, SearchInText: string): PtrInt;
|
||||
// returns the character index, where the SearchForText starts in SearchInText
|
||||
var
|
||||
p: LongInt;
|
||||
begin
|
||||
p:=System.Pos(SearchForText,SearchInText);
|
||||
if p>0 then
|
||||
Result:=UTF8Length(PChar(SearchInText),p-1)+1
|
||||
else
|
||||
Result:=0;
|
||||
end;
|
||||
|
||||
function UTF8Copy(const s: string; StartCharIndex, CharCount: PtrInt): string;
|
||||
// returns substring
|
||||
var
|
||||
StartBytePos: PChar;
|
||||
EndBytePos: PChar;
|
||||
MaxBytes: PtrInt;
|
||||
begin
|
||||
StartBytePos:=UTF8CharStart(PChar(s),length(s),StartCharIndex-1);
|
||||
if StartBytePos=nil then
|
||||
Result:=''
|
||||
else begin
|
||||
MaxBytes:=PtrInt(PChar(s)+length(s)-StartBytePos);
|
||||
EndBytePos:=UTF8CharStart(StartBytePos,MaxBytes,CharCount);
|
||||
if EndBytePos=nil then
|
||||
Result:=copy(s,StartBytePos-PChar(s)+1,MaxBytes)
|
||||
else
|
||||
Result:=copy(s,StartBytePos-PChar(s)+1,EndBytePos-StartBytePos);
|
||||
end;
|
||||
end;
|
||||
|
||||
procedure UTF8Delete(var s: String; StartCharIndex, CharCount: PtrInt);
|
||||
var
|
||||
StartBytePos: PChar;
|
||||
EndBytePos: PChar;
|
||||
MaxBytes: PtrInt;
|
||||
begin
|
||||
StartBytePos:=UTF8CharStart(PChar(s),length(s),StartCharIndex-1);
|
||||
if StartBytePos <> nil then
|
||||
begin
|
||||
MaxBytes:=PtrInt(PChar(s)+length(s)-StartBytePos);
|
||||
EndBytePos:=UTF8CharStart(StartBytePos,MaxBytes,CharCount);
|
||||
if EndBytePos=nil then
|
||||
Delete(s,StartBytePos-PChar(s)+1,MaxBytes)
|
||||
else
|
||||
Delete(s,StartBytePos-PChar(s)+1,EndBytePos-StartBytePos);
|
||||
end;
|
||||
end;
|
||||
|
||||
procedure UTF8Insert(const source: String; var s: string; StartCharIndex: PtrInt);
|
||||
var
|
||||
StartBytePos: PChar;
|
||||
begin
|
||||
StartBytePos:=UTF8CharStart(PChar(s),length(s),StartCharIndex-1);
|
||||
if StartBytePos <> nil then
|
||||
Insert(source, s, StartBytePos-PChar(s)+1);
|
||||
end;
|
||||
|
||||
{
|
||||
AInStr - The input string
|
||||
ALanguage - The language. Use '' for maximum speed if one desires to ignore the language
|
||||
@ -1884,6 +2153,115 @@ begin
|
||||
SetLength(Result,OutCounter);
|
||||
end;
|
||||
|
||||
function FindInvalidUTF8Character(p: PChar; Count: PtrInt;
|
||||
StopOnNonASCII: Boolean): PtrInt;
|
||||
// return -1 if ok
|
||||
var
|
||||
CharLen: Integer;
|
||||
begin
|
||||
if (p<>nil) then begin
|
||||
Result:=0;
|
||||
while Result<Count do begin
|
||||
if ord(p^)<128 then begin
|
||||
// regular single byte ASCII character (#0 is a character, this is pascal ;)
|
||||
CharLen:=1;
|
||||
end
|
||||
else if ord(p^)<%11000000 then begin
|
||||
// regular single byte character
|
||||
if StopOnNonASCII then
|
||||
exit;
|
||||
CharLen:=1;
|
||||
end
|
||||
else if ((ord(p^) and %11100000) = %11000000) then begin
|
||||
// could be 2 byte character
|
||||
if (Result<Count-1) and ((ord(p[1]) and %11000000) = %10000000) then
|
||||
CharLen:=2
|
||||
else
|
||||
exit; // missing following bytes
|
||||
end
|
||||
else if ((ord(p^) and %11110000) = %11100000) then begin
|
||||
// could be 3 byte character
|
||||
if (Result<Count-2) and ((ord(p[1]) and %11000000) = %10000000)
|
||||
and ((ord(p[2]) and %11000000) = %10000000) then
|
||||
CharLen:=3
|
||||
else
|
||||
exit; // missing following bytes
|
||||
end
|
||||
else if ((ord(p^) and %11111000) = %11110000) then begin
|
||||
// could be 4 byte character
|
||||
if (Result<Count-3) and ((ord(p[1]) and %11000000) = %10000000)
|
||||
and ((ord(p[2]) and %11000000) = %10000000)
|
||||
and ((ord(p[3]) and %11000000) = %10000000) then
|
||||
CharLen:=4
|
||||
else
|
||||
exit; // missing following bytes
|
||||
end
|
||||
else begin
|
||||
if StopOnNonASCII then
|
||||
exit;
|
||||
CharLen:=1;
|
||||
end;
|
||||
inc(Result,CharLen);
|
||||
inc(p,CharLen);
|
||||
if Result>Count then begin
|
||||
dec(Result,CharLen);
|
||||
exit; // missing following bytes
|
||||
end;
|
||||
end;
|
||||
end;
|
||||
// ok
|
||||
Result:=-1;
|
||||
end;
|
||||
|
||||
function ValidUTF8String(const s: String): String;
|
||||
var
|
||||
p, cur: PChar;
|
||||
l, lr: integer;
|
||||
NeedFree: Boolean;
|
||||
begin
|
||||
if FindInvalidUTF8Character(PChar(s), Length(s)) <> -1 then
|
||||
begin
|
||||
NeedFree := True;
|
||||
GetMem(p, Length(s) + 1);
|
||||
StrPCopy(p, s);
|
||||
UTF8FixBroken(p);
|
||||
end
|
||||
else
|
||||
begin
|
||||
p := PChar(s);
|
||||
NeedFree := False;
|
||||
end;
|
||||
|
||||
Result := '';
|
||||
cur := p;
|
||||
while cur^ <> #0 do
|
||||
begin
|
||||
l := UTF8CharacterLength(cur);
|
||||
if (l = 1) and (cur^ < #32) then
|
||||
Result := Result + '#' + IntToStr(Ord(cur^))
|
||||
else
|
||||
begin
|
||||
lr := Length(Result);
|
||||
SetLength(Result, lr + l);
|
||||
System.Move(cur^, Result[lr + 1], l);
|
||||
end;
|
||||
inc(cur, l)
|
||||
end;
|
||||
|
||||
if NeedFree then
|
||||
FreeMem(p);
|
||||
end;
|
||||
|
||||
procedure AssignUTF8ListToAnsi(UTF8List, AnsiList: TStrings);
|
||||
var
|
||||
i: Integer;
|
||||
begin
|
||||
AnsiList.Clear;
|
||||
if UTF8List=nil then exit;
|
||||
for i:=0 to UTF8List.Count-1 do
|
||||
AnsiList.Add(UTF8ToSys(UTF8List[i]));
|
||||
end;
|
||||
|
||||
{------------------------------------------------------------------------------
|
||||
Name: UTF8CompareStr
|
||||
Params: S1, S2 - UTF8 encoded strings
|
||||
|
Loading…
Reference in New Issue
Block a user