From e94508d5ebb06a85b7e459d3ed420b5d19f6ca45 Mon Sep 17 00:00:00 2001 From: sergei Date: Mon, 6 Feb 2012 17:55:06 +0000 Subject: [PATCH] * Improved utf16 to ucs4 conversion routines: removed function call on every character, allocate memory once and made as much as possible code shared between WideString and UnicodeString versions. git-svn-id: trunk@20269 - --- rtl/inc/ustrings.inc | 121 ++++++++++++++++++------------------------- rtl/unix/cwstring.pp | 30 +++++++---- 2 files changed, 69 insertions(+), 82 deletions(-) diff --git a/rtl/inc/ustrings.inc b/rtl/inc/ustrings.inc index cbc35c457a..de742f2841 100644 --- a/rtl/inc/ustrings.inc +++ b/rtl/inc/ustrings.inc @@ -1388,38 +1388,6 @@ end; {$endif CPU64} -{ converts an utf-16 code point or surrogate pair to utf-32 } -function utf16toutf32(const S: UnicodeString; const index: SizeInt; out len: longint): UCS4Char; [public, alias: 'FPC_UTF16TOUTF32']; -var - w: unicodechar; -begin - { UTF-16 points in the range #$0-#$D7FF and #$E000-#$FFFF } - { are the same in UTF-32 } - w:=s[index]; - if (w<=#$d7ff) or - (w>=#$e000) then - begin - result:=UCS4Char(w); - len:=1; - end - { valid surrogate pair? } - else if (w<=#$dbff) and - { w>=#$d7ff check not needed, checked above } - (index=#$dc00) and - (s[index+1]<=#$dfff) then - { convert the surrogate pair to UTF-32 } - begin - result:=(UCS4Char(w)-$d800) shl 10 + (UCS4Char(s[index+1])-$dc00) + $10000; - len:=2; - end - else - { invalid surrogate -> do nothing } - begin - result:=UCS4Char(w); - len:=1; - end; -end; function UnicodeToUtf8(Dest: PChar; Source: PUnicodeChar; MaxBytes: SizeInt): SizeInt;{$ifdef SYSTEMINLINE}inline;{$endif} @@ -1870,26 +1838,60 @@ function Utf8ToAnsi(const s : RawByteString) : RawByteString;{$ifdef SYSTEMINLIN end; -function UnicodeStringToUCS4String(const s : UnicodeString) : UCS4String; +procedure UCS4Encode(p: PWideChar; len: sizeint; out res: UCS4String); var - i, slen, - destindex : SizeInt; - len : longint; + i, reslen: sizeint; + w: longint; begin - slen:=length(s); - setlength(result,slen+1); - i:=1; - destindex:=0; - while (i<=slen) do + reslen:=0; + i:=0; + { calculate required length } + while (i=#$e000) then + inc(i) + else if (p[i]<=#$dbff) and + (i+1=#$dc00) and + (p[i+1]<=#$dfff) then + inc(i,2) + else + inc(i); + inc(reslen); end; - { destindex <= slen (surrogate pairs may have been merged) } - { destindex+1 for terminating #0 (dynamic arrays are } - { implicitely filled with zero) } - setlength(result,destindex+1); + SetLength(res,reslen+1); { +1 for null termination } + reslen:=0; + i:=0; + { do conversion } + while (i=$e000) then + res[reslen]:=w + else if (w<=$dbff) and + (i+1=#$dc00) and + (p[i+1]<=#$dfff) then + begin + res[reslen]:=(UCS4Char(w-$d7c0) shl 10)+(UCS4Char(p[i+1]) xor $dc00); + inc(i); + end + else { invalid surrogate pair } + res[reslen]:=w; + inc(i); + inc(reslen); + end; + res[reslen]:=0; + end; + +function UnicodeStringToUCS4String(const s : UnicodeString) : UCS4String; + begin + UCS4Encode(PWideChar(s),Length(s),result); + end; + +function WideStringToUCS4String(const s : WideString) : UCS4String; + begin + UCS4Encode(PWideChar(s),Length(s),result); end; @@ -1942,29 +1944,6 @@ function UCS4StringToUnicodeString(const s : UCS4String) : UnicodeString; end; -function WideStringToUCS4String(const s : WideString) : UCS4String; - var - i, slen, - destindex : SizeInt; - len : longint; - begin - slen:=length(s); - setlength(result,slen+1); - i:=1; - destindex:=0; - while (i<=slen) do - begin - result[destindex]:=utf16toutf32(s,i,len); - inc(destindex); - inc(i,len); - end; - { destindex <= slen (surrogate pairs may have been merged) } - { destindex+1 for terminating #0 (dynamic arrays are } - { implicitely filled with zero) } - setlength(result,destindex+1); - end; - - { concatenates an utf-32 char to a widestring. S *must* be unique when entering. } procedure ConcatUTF32ToWideStr(const nc: UCS4Char; var S: WideString; var index: SizeInt); var diff --git a/rtl/unix/cwstring.pp b/rtl/unix/cwstring.pp index 3472674a48..3f9db534de 100644 --- a/rtl/unix/cwstring.pp +++ b/rtl/unix/cwstring.pp @@ -643,14 +643,10 @@ function UpperAnsiString(const s : AnsiString) : AnsiString; SetLength(result,resindex-1); end; - -function utf16toutf32(const S: WideString; const index: SizeInt; out len: longint): UCS4Char; external name 'FPC_UTF16TOUTF32'; - function WideStringToUCS4StringNoNulls(const s : WideString) : UCS4String; var i, slen, destindex : SizeInt; - len : longint; uch : UCS4Char; begin slen:=length(s); @@ -659,16 +655,28 @@ function WideStringToUCS4StringNoNulls(const s : WideString) : UCS4String; destindex:=0; while (i<=slen) do begin - uch:=utf16toutf32(s,i,len); - if (uch=UCS4Char(0)) then - uch:=UCS4Char(32); - result[destindex]:=uch; + uch:=UCS4Char(s[i]); + if (uch=0) then + result[destindex]:=32 + else if (uch<=$d7ff) or (uch>=$e000) then + result[destindex]:=uch + else if (uch<=$dbff) and + (i=#$dc00) and + (s[i+1]<=#$dfff) then + begin + result[destindex]:=(UCS4Char(uch-$d7c0) shl 10)+(UCS4Char(s[i+1]) xor $dc00); + inc(i); + end + else { invalid surrogate pair } + result[destindex]:=uch; + inc(i); inc(destindex); - inc(i,len); end; result[destindex]:=UCS4Char(0); - { destindex <= slen } - setlength(result,destindex+1); + { Trimming length in this particular case is just a waste of time, + because result will be interpreted as null-terminated and discarded + almost immediately } end;