First steps of a major move of code from lclproc to lazutf8

git-svn-id: trunk@32986 -
This commit is contained in:
sekelsenmat 2011-10-19 19:16:31 +00:00
parent 207d40aab6
commit b3f587b60f
2 changed files with 466 additions and 15 deletions

View File

@ -34,17 +34,90 @@ interface
uses
Classes, SysUtils, lazutf8;
//function UTF16CharacterLength(p: PWideChar): integer;
//function UTF16Length(const s: widestring): PtrInt;
//function UTF16Length(p: PWideChar; WordCount: PtrInt): PtrInt;
//function UTF16CharacterToUnicode(p: PWideChar; out CharLen: integer): Cardinal;
//function UnicodeToUTF16(u: cardinal): widestring;
function UTF16CharacterLength(p: PWideChar): integer;
function UTF16Length(const s: widestring): PtrInt;
function UTF16Length(p: PWideChar; WordCount: PtrInt): PtrInt;
function UTF16CharacterToUnicode(p: PWideChar; out CharLen: integer): Cardinal;
function UnicodeToUTF16(u: cardinal): widestring;
function UnicodeLowercase(u: cardinal): cardinal;
function UTF8LowerCaseViaTables(const s: utf8string): utf8string;
implementation
function UTF16CharacterLength(p: PWideChar): integer;
// returns length of UTF16 character in number of words
// The endianess of the machine will be taken.
begin
if p<>nil then begin
if (ord(p[0]) < $D800) or (ord(p[0]) > $DFFF) then
Result:=1
else
Result:=2;
end else begin
Result:=0;
end;
end;
function UTF16Length(const s: widestring): PtrInt;
begin
Result:=UTF16Length(PWideChar(s),length(s));
end;
function UTF16Length(p: PWideChar; WordCount: PtrInt): PtrInt;
var
CharLen: LongInt;
begin
Result:=0;
while (WordCount>0) do begin
inc(Result);
CharLen:=UTF16CharacterLength(p);
inc(p,CharLen);
dec(WordCount,CharLen);
end;
end;
function UTF16CharacterToUnicode(p: PWideChar; out CharLen: integer): Cardinal;
var
w1: cardinal;
w2: Cardinal;
begin
if p<>nil then begin
w1:=ord(p[0]);
if (w1 < $D800) or (w1 > $DFFF) then begin
// is 1 word character
Result:=w1;
CharLen:=1;
end else begin
// could be 2 word character
w2:=ord(p[1]);
if (w2>=$DC00) then begin
// is 2 word character
Result:=(w1-$D800) shl 10 + (w2-$DC00) + $10000;
CharLen:=2;
end else begin
// invalid character
Result:=w1;
CharLen:=1;
end;
end;
end else begin
Result:=0;
CharLen:=0;
end;
end;
function UnicodeToUTF16(u: cardinal): widestring;
begin
// u should be <= $10FFFF to fit into UTF-16
if u < $10000 then
// Note: codepoints $D800 - $DFFF are reserved
Result:=system.widechar(u)
else
Result:=system.widechar($D800+((u - $10000) shr 10))+system.widechar($DC00+((u - $10000) and $3ff));
end;
// Lowercase Unicode Tables which match UTF-16 but also UTF-32
var
UnicodeLower00C0_00DE: array[$00C0..$00DE] of word;

View File

@ -45,7 +45,7 @@ function UTF8CharacterToUnicode(p: PChar; out CharLen: integer): Cardinal;
function UnicodeToUTF8(u: cardinal; Buf: PChar): integer; inline;
function UnicodeToUTF8SkipErrors(u: cardinal; Buf: PChar): integer;
function UnicodeToUTF8(u: cardinal): shortstring; inline;
{function UTF8ToDoubleByteString(const s: string): string;
function UTF8ToDoubleByteString(const s: string): string;
function UTF8ToDoubleByte(UTF8Str: PChar; Len: PtrInt; DBStr: PByte): PtrInt;
function UTF8FindNearestCharStart(UTF8Str: PChar; Len: integer;
BytePos: integer): integer;
@ -53,23 +53,21 @@ function UTF8FindNearestCharStart(UTF8Str: PChar; Len: integer;
function UTF8CharStart(UTF8Str: PChar; Len, CharIndex: PtrInt): PChar;
// find the byte index of the n-th UTF8 character, ignoring BIDI (byte len of substr)
function UTF8CharToByteIndex(UTF8Str: PChar; Len, CharIndex: PtrInt): PtrInt;
procedure UTF8FixBroken(P: PChar);}
procedure UTF8FixBroken(P: PChar);
function UTF8CharacterStrictLength(P: PChar): integer;
{function UTF8CStringToUTF8String(SourceStart: PChar; SourceLen: PtrInt) : string;
function UTF8CStringToUTF8String(SourceStart: PChar; SourceLen: PtrInt) : string;
function UTF8Pos(const SearchForText, SearchInText: string): PtrInt;
function UTF8Copy(const s: string; StartCharIndex, CharCount: PtrInt): string;
procedure UTF8Delete(var s: String; StartCharIndex, CharCount: PtrInt);
procedure UTF8Insert(const source: String; var s: string; StartCharIndex: PtrInt);}
procedure UTF8Insert(const source: String; var s: string; StartCharIndex: PtrInt);
function UTF8LowerCase(const AInStr: utf8string; ALanguage: utf8string=''): utf8string;
function UTF8UpperCase(const AInStr: utf8string; ALanguage: utf8string=''): utf8string;
{function FindInvalidUTF8Character(p: PChar; Count: PtrInt;
// StopOnNonASCII: Boolean = false): PtrInt;
//function ValidUTF8String(const s: String): String;
function FindInvalidUTF8Character(p: PChar; Count: PtrInt;
StopOnNonASCII: Boolean = false): PtrInt;
function ValidUTF8String(const s: String): String;
//procedure AssignUTF8ListToAnsi(UTF8List, AnsiList: TStrings);
}
procedure AssignUTF8ListToAnsi(UTF8List, AnsiList: TStrings);
//compare functions
@ -347,6 +345,156 @@ begin
Result[0]:=chr(UnicodeToUTF8(u,@Result[1]));
end;
function UTF8ToDoubleByteString(const s: string): string;
var
Len: Integer;
begin
Len:=UTF8Length(s);
SetLength(Result,Len*2);
if Len=0 then exit;
UTF8ToDoubleByte(PChar(s),length(s),PByte(Result));
end;
{ returns number of double bytes }
function UTF8ToDoubleByte(UTF8Str: PChar; Len: PtrInt; DBStr: PByte): PtrInt;
var
SrcPos: PChar;
CharLen: LongInt;
DestPos: PByte;
u: Cardinal;
begin
SrcPos:=UTF8Str;
DestPos:=DBStr;
Result:=0;
while Len>0 do begin
u:=UTF8CharacterToUnicode(SrcPos,CharLen);
DestPos^:=byte((u shr 8) and $ff);
inc(DestPos);
DestPos^:=byte(u and $ff);
inc(DestPos);
inc(SrcPos,CharLen);
dec(Len,CharLen);
inc(Result);
end;
end;
{ Find the start of the UTF8 character which contains BytePos,
Len is length in byte, BytePos starts at 0 }
function UTF8FindNearestCharStart(UTF8Str: PChar; Len: integer;
BytePos: integer): integer;
begin
Result:=0;
if (UTF8Str<>nil) and (Len>0) and (BytePos>=0) then begin
Result:=BytePos;
if Result>Len then Result:=Len-1;
if (Result>0) and (ord(UTF8Str[Result]) and %11000000=%10000000) then begin
dec(Result);
if (Result>0) and (ord(UTF8Str[Result]) and %11000000=%10000000) then begin
dec(Result);
if (Result>0) and (ord(UTF8Str[Result]) and %11000000=%10000000) then begin
dec(Result);
// should be four byte character
if (ord(UTF8Str[Result]) and %11111000<>%11110000) then begin
// broken UTF8 character
inc(Result,3);
end else begin
// is four byte character
end;
end else if (ord(UTF8Str[Result]) and %11110000<>%11100000) then begin
// broken UTF8 character, should be three byte
inc(Result,2);
end else
begin
// is three byte character
end;
end else if (ord(UTF8Str[Result]) and %11100000<>%11000000) then begin
// broken UTF8 character, should be two byte
inc(Result);
end else
begin
// is two byte character
end;
end;
end;
end;
{ Len is the length in bytes of UTF8Str
CharIndex is the position of the desired char (starting at 0), in chars
This function is similar to UTF8FindNearestCharStart
}
function UTF8CharStart(UTF8Str: PChar; Len, CharIndex: PtrInt): PChar;
var
CharLen: LongInt;
begin
Result:=UTF8Str;
if Result<>nil then begin
while (CharIndex>0) and (Len>0) do begin
CharLen:=UTF8CharacterLength(Result);
dec(Len,CharLen);
dec(CharIndex);
inc(Result,CharLen);
end;
if (CharIndex>0) or (Len<0) then
Result:=nil;
end;
end;
function UTF8CharToByteIndex(UTF8Str: PChar; Len, CharIndex: PtrInt): PtrInt;
var
p: PChar;
begin
p := UTF8CharStart(UTF8Str, Len, CharIndex);
if p = nil
then Result := -1
else Result := p - UTF8Str;
end;
{ fix any broken UTF8 sequences with spaces }
procedure UTF8FixBroken(P: PChar);
begin
if p=nil then exit;
while p^<>#0 do begin
if ord(p^)<%10000000 then begin
// regular single byte character
inc(p);
end
else if ord(p^)<%11000000 then begin
// invalid
p^:=' ';
inc(p);
end
else if ((ord(p^) and %11100000) = %11000000) then begin
// should be 2 byte character
if (ord(p[1]) and %11000000) = %10000000 then
inc(p,2)
else if p[1]<>#0 then
p^:=' ';
end
else if ((ord(p^) and %11110000) = %11100000) then begin
// should be 3 byte character
if ((ord(p[1]) and %11000000) = %10000000)
and ((ord(p[2]) and %11000000) = %10000000) then
inc(p,3)
else
p^:=' ';
end
else if ((ord(p^) and %11111000) = %11110000) then begin
// should be 4 byte character
if ((ord(p[1]) and %11000000) = %10000000)
and ((ord(p[2]) and %11000000) = %10000000)
and ((ord(p[3]) and %11000000) = %10000000) then
inc(p,4)
else
p^:=' ';
end
else begin
p^:=' ';
inc(p);
end;
end;
end;
function UTF8CharacterStrictLength(P: PChar): integer;
begin
if p=nil then exit(0);
@ -385,6 +533,127 @@ begin
exit(0);
end;
function UTF8CStringToUTF8String(SourceStart: PChar; SourceLen: PtrInt) : string;
var
Source: PChar;
Dest: PChar;
SourceEnd: PChar;
CharLen: integer;
SourceCopied: PChar;
// Copies from SourceStart till Source to Dest and updates Dest
procedure CopyPart; inline;
var
CopyLength: SizeInt;
begin
CopyLength := Source - SourceCopied;
if CopyLength=0 then exit;
System.move(SourceCopied^ , Dest^, CopyLength);
SourceCopied:=Source;
inc(Dest, CopyLength);
end;
begin
SetLength(Result, SourceLen);
if SourceLen=0 then exit;
SourceCopied:=SourceStart;
Source:=SourceStart;
Dest:=PChar(Result);
SourceEnd := Source + SourceLen;
while Source<SourceEnd do begin
CharLen := UTF8CharacterLength(Source);
if (CharLen=1) and (Source^='\') then begin
CopyPart;
inc(Source);
if Source^ in ['t', 'n', '"', '\'] then begin
case Source^ of
't' : Dest^ := #9;
'"' : Dest^ := '"';
'\' : Dest^ := '\';
'n' :
// fpc 2.1.1 stores string constants as array of char so maybe this
// will work for without ifdef (once available in 2.0.x too):
// move(lineending, dest^, sizeof(LineEnding));
{$IFDEF WINDOWS}
begin
move(lineending[1], dest^, length(LineEnding));
inc(dest, length(LineEnding)-1);
end;
{$ELSE}
Dest^ := LineEnding;
{$ENDIF}
end;
inc(Source);
inc(Dest);
end;
SourceCopied := Source;
end
else
Inc(Source, CharLen);
end;
CopyPart;
SetLength(Result, Dest - PChar(Result));
end;
function UTF8Pos(const SearchForText, SearchInText: string): PtrInt;
// returns the character index, where the SearchForText starts in SearchInText
var
p: LongInt;
begin
p:=System.Pos(SearchForText,SearchInText);
if p>0 then
Result:=UTF8Length(PChar(SearchInText),p-1)+1
else
Result:=0;
end;
function UTF8Copy(const s: string; StartCharIndex, CharCount: PtrInt): string;
// returns substring
var
StartBytePos: PChar;
EndBytePos: PChar;
MaxBytes: PtrInt;
begin
StartBytePos:=UTF8CharStart(PChar(s),length(s),StartCharIndex-1);
if StartBytePos=nil then
Result:=''
else begin
MaxBytes:=PtrInt(PChar(s)+length(s)-StartBytePos);
EndBytePos:=UTF8CharStart(StartBytePos,MaxBytes,CharCount);
if EndBytePos=nil then
Result:=copy(s,StartBytePos-PChar(s)+1,MaxBytes)
else
Result:=copy(s,StartBytePos-PChar(s)+1,EndBytePos-StartBytePos);
end;
end;
procedure UTF8Delete(var s: String; StartCharIndex, CharCount: PtrInt);
var
StartBytePos: PChar;
EndBytePos: PChar;
MaxBytes: PtrInt;
begin
StartBytePos:=UTF8CharStart(PChar(s),length(s),StartCharIndex-1);
if StartBytePos <> nil then
begin
MaxBytes:=PtrInt(PChar(s)+length(s)-StartBytePos);
EndBytePos:=UTF8CharStart(StartBytePos,MaxBytes,CharCount);
if EndBytePos=nil then
Delete(s,StartBytePos-PChar(s)+1,MaxBytes)
else
Delete(s,StartBytePos-PChar(s)+1,EndBytePos-StartBytePos);
end;
end;
procedure UTF8Insert(const source: String; var s: string; StartCharIndex: PtrInt);
var
StartBytePos: PChar;
begin
StartBytePos:=UTF8CharStart(PChar(s),length(s),StartCharIndex-1);
if StartBytePos <> nil then
Insert(source, s, StartBytePos-PChar(s)+1);
end;
{
AInStr - The input string
ALanguage - The language. Use '' for maximum speed if one desires to ignore the language
@ -1884,6 +2153,115 @@ begin
SetLength(Result,OutCounter);
end;
function FindInvalidUTF8Character(p: PChar; Count: PtrInt;
StopOnNonASCII: Boolean): PtrInt;
// return -1 if ok
var
CharLen: Integer;
begin
if (p<>nil) then begin
Result:=0;
while Result<Count do begin
if ord(p^)<128 then begin
// regular single byte ASCII character (#0 is a character, this is pascal ;)
CharLen:=1;
end
else if ord(p^)<%11000000 then begin
// regular single byte character
if StopOnNonASCII then
exit;
CharLen:=1;
end
else if ((ord(p^) and %11100000) = %11000000) then begin
// could be 2 byte character
if (Result<Count-1) and ((ord(p[1]) and %11000000) = %10000000) then
CharLen:=2
else
exit; // missing following bytes
end
else if ((ord(p^) and %11110000) = %11100000) then begin
// could be 3 byte character
if (Result<Count-2) and ((ord(p[1]) and %11000000) = %10000000)
and ((ord(p[2]) and %11000000) = %10000000) then
CharLen:=3
else
exit; // missing following bytes
end
else if ((ord(p^) and %11111000) = %11110000) then begin
// could be 4 byte character
if (Result<Count-3) and ((ord(p[1]) and %11000000) = %10000000)
and ((ord(p[2]) and %11000000) = %10000000)
and ((ord(p[3]) and %11000000) = %10000000) then
CharLen:=4
else
exit; // missing following bytes
end
else begin
if StopOnNonASCII then
exit;
CharLen:=1;
end;
inc(Result,CharLen);
inc(p,CharLen);
if Result>Count then begin
dec(Result,CharLen);
exit; // missing following bytes
end;
end;
end;
// ok
Result:=-1;
end;
function ValidUTF8String(const s: String): String;
var
p, cur: PChar;
l, lr: integer;
NeedFree: Boolean;
begin
if FindInvalidUTF8Character(PChar(s), Length(s)) <> -1 then
begin
NeedFree := True;
GetMem(p, Length(s) + 1);
StrPCopy(p, s);
UTF8FixBroken(p);
end
else
begin
p := PChar(s);
NeedFree := False;
end;
Result := '';
cur := p;
while cur^ <> #0 do
begin
l := UTF8CharacterLength(cur);
if (l = 1) and (cur^ < #32) then
Result := Result + '#' + IntToStr(Ord(cur^))
else
begin
lr := Length(Result);
SetLength(Result, lr + l);
System.Move(cur^, Result[lr + 1], l);
end;
inc(cur, l)
end;
if NeedFree then
FreeMem(p);
end;
procedure AssignUTF8ListToAnsi(UTF8List, AnsiList: TStrings);
var
i: Integer;
begin
AnsiList.Clear;
if UTF8List=nil then exit;
for i:=0 to UTF8List.Count-1 do
AnsiList.Add(UTF8ToSys(UTF8List[i]));
end;
{------------------------------------------------------------------------------
Name: UTF8CompareStr
Params: S1, S2 - UTF8 encoded strings