Large overhaul to simplify the lowercase1 code

git-svn-id: trunk@32861 -
This commit is contained in:
sekelsenmat 2011-10-12 16:38:56 +00:00
parent 443e4b5478
commit 7914ea9804
2 changed files with 175 additions and 270 deletions

View File

@ -69,8 +69,7 @@ function UTF8LowerCaseMattias(const s: utf8string): utf8string;
{$endif} {$endif}
function UTF8LowerCase(const AInStr: utf8string; ALocale: utf8string=''): utf8string; function UTF8LowerCase(const AInStr: utf8string; ALocale: utf8string=''): utf8string;
function UTF8LowerCase2(const AInStr: utf8string; ALocale: utf8string=''): utf8string; function UTF8LowerCase2(const AInStr: utf8string; ALocale: utf8string=''): utf8string;
function UTF8UpperCase(const AInStr: utf8string): utf8string; function UTF8UpperCase(const AInStr: utf8string; ALocale: utf8string=''): utf8string;
function UTF8UpperCase(const AInStr, ALocale: utf8string): utf8string;
{function FindInvalidUTF8Character(p: PChar; Count: PtrInt; {function FindInvalidUTF8Character(p: PChar; Count: PtrInt;
// StopOnNonASCII: Boolean = false): PtrInt; // StopOnNonASCII: Boolean = false): PtrInt;
//function ValidUTF8String(const s: String): String; //function ValidUTF8String(const s: String): String;
@ -1461,7 +1460,8 @@ begin
case c of // if NOT TABLE case c of // if NOT TABLE
#0: #0:
begin begin
if InStr >= InStrEnd then begin if InStr >= InStrEnd then
begin
SetLength(Result,OutStr - PChar(Result)); SetLength(Result,OutStr - PChar(Result));
exit; exit;
end; end;
@ -1526,7 +1526,6 @@ begin
end; end;
end; // Case InStr^ end; // Case InStr^
end; end;
end; end;
end; end;
@ -1537,14 +1536,17 @@ end;
Data from here: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt Data from here: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
List of ranges which have lowercase: List of ranges which have lowercase:
$0041..$0061 ASCII
codepoints UTF-8 range Description
$0041..$0061 $41..$61 ASCII
$00C0..$00DE: Result:=UnicodeLower00C0_00DE[u]; $00C0..$00DE: Result:=UnicodeLower00C0_00DE[u];
$0100..$024E: Result:=UnicodeLower0100_024E[u]; $0100..$024E: Result:=UnicodeLower0100_024E[u];
$0386..$03AB: Result:=UnicodeLower0386_03AB[u]; $0386..$03AB: Result:=UnicodeLower0386_03AB[u];
$03D8..$042F: Result:=UnicodeLower03D8_042F[u]; $03D8..$042F: Result:=UnicodeLower03D8_042F[u];
$0460..$0512: Result:=UnicodeLower0460_0512[u]; $0460..$0512: Result:=UnicodeLower0460_0512[u];
$0531..$0556: Result:=u+48; $0531..$0556: Result:=u+48;
$10A0..$10C5 Georgian $10A0..$10C5 E1 82 A0..E1 82 BF
E1 83 80..E1 83 85 Georgian
$1E00..$1FFC: Result:=UnicodeLower1E00_1FFC[u]; $1E00..$1FFC: Result:=UnicodeLower1E00_1FFC[u];
$2126..$2183: Result:=UnicodeLower2126_2183[u]; $2126..$2183: Result:=UnicodeLower2126_2183[u];
$24B6..$24CF: Result:=u+26; $24B6..$24CF: Result:=u+26;
@ -1558,7 +1560,7 @@ var
InStr, InStrEnd, OutStr: PChar; InStr, InStrEnd, OutStr: PChar;
// Language identification // Language identification
IsTurkish: Boolean; IsTurkish: Boolean;
c, c2: Char; c1, c2, c3, new_c1: Char;
begin begin
Result:=AInStr; Result:=AInStr;
InStr := PChar(AInStr); InStr := PChar(AInStr);
@ -1568,8 +1570,8 @@ begin
// UniqueString if the resulting string will be identical // UniqueString if the resulting string will be identical
while (InStr < InStrEnd) do while (InStr < InStrEnd) do
begin begin
c := InStr^; c1 := InStr^;
case c of case c1 of
'A'..'Z',#$C3, #$C4, #$C5..#$C8, #$CE, #$D0..#$D2, #$E1: Break; 'A'..'Z',#$C3, #$C4, #$C5..#$C8, #$CE, #$D0..#$D2, #$E1: Break;
// already lower, or otherwhise not affected // already lower, or otherwhise not affected
else else
@ -1588,14 +1590,15 @@ begin
while InStr < InStrEnd do while InStr < InStrEnd do
begin begin
c := InStr^; c1 := InStr^;
case c of // if NOT TABLE new_c1 := c1;
case c1 of
'A'..'Z': 'A'..'Z':
begin begin
{ First ASCII chars } { First ASCII chars }
// Special turkish handling // Special turkish handling
// capital undotted I to small undotted i // capital undotted I to small undotted i
if IsTurkish and (c = 'I') then if IsTurkish and (c1 = 'I') then
begin begin
OutStr := PChar(OutStr - PChar(Result)); OutStr := PChar(OutStr - PChar(Result));
SetLength(Result,Length(Result)+1);// Increase the buffer SetLength(Result,Length(Result)+1);// Increase the buffer
@ -1604,34 +1607,27 @@ begin
inc(OutStr); inc(OutStr);
OutStr^ := #$B1; OutStr^ := #$B1;
dec(CounterDiff); dec(CounterDiff);
inc(InStr);
inc(OutStr);
end end
else else
begin begin
OutStr^ := chr(ord(c)+32); OutStr^ := chr(ord(c1)+32);
end;
inc(InStr); inc(InStr);
inc(OutStr); inc(OutStr);
end; end;
end;
// Chars with 2-bytes which might be modified
#$C3, #$C4, #$C5..#$C8, #$CE, #$D0..#$D2:
begin
c2 := InStr[1];
case c1 of
// Latin Characters 00000FFF http://en.wikibooks.org/wiki/Unicode/Character_reference/0000-0FFF // Latin Characters 00000FFF http://en.wikibooks.org/wiki/Unicode/Character_reference/0000-0FFF
// $C380..$C39E: NewChar := OldChar + $20; // $C380..$C39E: NewChar := OldChar + $20;
// $C39F: ß already lowercase // $C39F: ß already lowercase
#$C3: #$C3:
begin begin
if InStr[1] in [#$80..#$9E] then if c2 in [#$80..#$9E] then
begin c2 := chr(ord(c2) + $20);
if (CounterDiff <> 0) then OutStr^ := InStr[0];
OutStr[1] := chr(ord(InStr[1]) + $20);
end
else if (CounterDiff <> 0) then
begin
OutStr^ :=InStr[0];
OutStr[1] :=InStr[1];
end;
inc(InStr, 2);
inc(OutStr, 2);
end; end;
// $C481..$C4A9: if OldChar mod 2 = 0 then NewChar := OldChar + 1; // $C481..$C4A9: if OldChar mod 2 = 0 then NewChar := OldChar + 1;
// Turkish capital dotted i to small dotted i // Turkish capital dotted i to small dotted i
@ -1642,43 +1638,26 @@ begin
// $C4B8..$C588: if OldChar mod 2 = 1 then NewChar := OldChar + 1; // $C4B8..$C588: if OldChar mod 2 = 1 then NewChar := OldChar + 1;
#$C4: #$C4:
begin begin
c := InStr[1]; case c2 of
case c of
#$81..#$A9, #$B2..#$B6: //0 #$81..#$A9, #$B2..#$B6: //0
begin begin
if ord(c) mod 2 = 0 then if ord(c2) mod 2 = 0 then
begin c2 := chr(ord(c2) + 1);
if (CounterDiff <> 0) then OutStr^ := InStr[0];
OutStr[1] := chr(ord(c) + 1);
end
else if (CounterDiff <> 0) then
begin
OutStr^ :=InStr[0];
OutStr[1] :=c;
end;
end; end;
#$B8..#$FF: //1 #$B8..#$FF: //1
begin begin
if ord(c) mod 2 = 1 then if ord(c2) mod 2 = 1 then
begin c2 := chr(ord(c2) + 1);
if (CounterDiff <> 0) then OutStr^ := InStr[0];
OutStr[1] := chr(ord(c) + 1);
end
else if (CounterDiff <> 0) then
begin
OutStr^ :=InStr[0];
OutStr[1] :=c;
end;
end; end;
#$B0: #$B0:
begin begin
OutStr^ := 'i'; OutStr^ := 'i';
dec(OutStr);
inc(CounterDiff, 1);
end;
end;
inc(InStr, 2); inc(InStr, 2);
inc(OutStr, 2); inc(OutStr);
inc(CounterDiff, 1);
Continue;
end;
end;
end; end;
// $C589 ʼn => ? // $C589 ʼn => ?
// $C58A..$C5B7: if OldChar mod 2 = 0 then NewChar := OldChar + 1; // $C58A..$C5B7: if OldChar mod 2 = 0 then NewChar := OldChar + 1;
@ -1686,174 +1665,105 @@ begin
// $C5B9..$C8B3: if OldChar mod 2 = 1 then NewChar := OldChar + 1; // $C5B9..$C8B3: if OldChar mod 2 = 1 then NewChar := OldChar + 1;
#$C5: #$C5:
begin begin
c := InStr[1]; case c2 of
case c of
#$8A..#$B7: //0 #$8A..#$B7: //0
begin begin
if ord(c) mod 2 = 0 then if ord(c2) mod 2 = 0 then
begin c2 := chr(ord(c2) + 1);
if (CounterDiff <> 0) then OutStr^ := InStr[0];
OutStr[1] := chr(ord(c) + 1);
end
else if (CounterDiff <> 0) then
begin
OutStr^ := InStr[0];
OutStr[1] :=c;
end;
end; end;
#$00..#$88, #$B9..#$FF: //1 #$00..#$88, #$B9..#$FF: //1
begin begin
if ord(c) mod 2 = 1 then if ord(c2) mod 2 = 1 then
begin c2 := chr(ord(c2) + 1);
if (CounterDiff <> 0) then OutStr^ := InStr[0];
OutStr[1] := chr(ord(c) + 1);
end
else if (CounterDiff <> 0) then
begin
OutStr^ := InStr[0];
OutStr[1] :=c;
end;
end; end;
#$B8: // Ÿ #$B8: // Ÿ
begin begin
OutStr[0] := #$C3; new_c1 := #$C3;
OutStr[1] := #$BF; c2 := #$BF;
end; end;
end; end;
inc(InStr, 2);
inc(OutStr, 2);
end; end;
#$C6..#$C7: #$C6..#$C7:
begin begin
c := InStr[1]; if ord(c2) mod 2 = 1 then
if ord(c) mod 2 = 1 then c2 := chr(ord(c2) + 1);
begin
if (CounterDiff <> 0) then OutStr^ := InStr[0];
OutStr[1] := chr(ord(c) + 1);
end
else if (CounterDiff <> 0) then
begin
OutStr^ := InStr[0];
OutStr[1] :=c;
end;
inc(InStr, 2);
inc(OutStr, 2);
end; end;
#$C8: #$C8:
begin begin
c := InStr[1]; if (c2 in [#$00..#$B3]) and (ord(c2) mod 2 = 1) then
if (c in [#$00..#$B3]) and (ord(c) mod 2 = 1) then c2 := chr(ord(c2) + 1);
begin
if (CounterDiff <> 0) then OutStr^ := InStr[0];
OutStr[1] := chr(ord(c) + 1);
end
else if (CounterDiff <> 0) then
begin
OutStr^ := InStr[0];
OutStr[1] :=c;
end;
inc(InStr, 2);
inc(OutStr, 2);
end; end;
// $CE91..$CE9F: NewChar := OldChar + $20; // Greek Characters // $CE91..$CE9F: NewChar := OldChar + $20; // Greek Characters
// $CEA0..$CEA9: NewChar := OldChar + $E0; // Greek Characters // $CEA0..$CEA9: NewChar := OldChar + $E0; // Greek Characters
#$CE: #$CE:
begin begin
c := InStr[1]; case c2 of
case c of
#$91..#$9F: #$91..#$9F:
begin begin
if (CounterDiff <> 0) then OutStr^ := InStr[0]; c2 := chr(ord(c2) + $20);
OutStr[1] := chr(ord(c) + $20);
end; end;
#$A0..#$A9: #$A0..#$A9:
begin begin
OutStr^ := chr(ord(InStr[0])+1); c2 := chr(ord(c2) - $10);
OutStr[1] := chr(ord(c) - $10);
end; end;
end; end;
inc(InStr, 2);
inc(OutStr, 2);
end; end;
// $D080..$D08F: NewChar := OldChar + $110; // Cyrillic alphabet // $D080..$D08F: NewChar := OldChar + $110; // Cyrillic alphabet
// $D090..$D09F: NewChar := OldChar + $20; // Cyrillic alphabet // $D090..$D09F: NewChar := OldChar + $20; // Cyrillic alphabet
// $D0A0..$D0AF: NewChar := OldChar + $E0; // Cyrillic alphabet // $D0A0..$D0AF: NewChar := OldChar + $E0; // Cyrillic alphabet
#$D0: #$D0:
begin begin
c := InStr[1]; c2 := InStr[1];
case c of case c2 of
#$80..#$8F: #$80..#$8F:
begin begin
OutStr^ := chr(ord(InStr[0])+1); new_c1 := chr(ord(c1)+1);
OutStr[1] := chr(ord(c) + $10); c2 := chr(ord(c2) + $10);
end; end;
#$90..#$9F: #$90..#$9F:
begin begin
if (CounterDiff <> 0) then OutStr^ := InStr[0]; c2 := chr(ord(c2) + $20);
OutStr[1] := chr(ord(c) + $20);
end; end;
#$A0..#$AF: #$A0..#$AF:
begin begin
OutStr^ := chr(ord(InStr[0])+1); new_c1 := chr(ord(c1)+1);
OutStr[1] := chr(ord(c) - $20); c2 := chr(ord(c2) - $20);
end; end;
end; end;
inc(InStr, 2);
inc(OutStr, 2);
end; end;
// Archaic and non-slavic cyrillic 460-47F = D1A0-D1BF // Archaic and non-slavic cyrillic 460-47F = D1A0-D1BF
// These require just adding 1 to get the lowercase // These require just adding 1 to get the lowercase
#$D1: #$D1:
begin begin
c := InStr[1]; if (c2 in [#$A0..#$BF]) and (ord(c2) mod 2 = 0) then
if (c in [#$A0..#$BF]) and (ord(c) mod 2 = 0) then c2 := chr(ord(c2) + 1);
begin
if (CounterDiff <> 0) then OutStr^ := InStr[0];
OutStr[1] := chr(ord(c) + 1);
end
else if (CounterDiff <> 0) then
begin
OutStr^ := InStr[0];
OutStr[1] := c;
end;
inc(InStr, 2);
inc(OutStr, 2);
end; end;
// Archaic and non-slavic cyrillic 480-4BF = D280-D2BF // Archaic and non-slavic cyrillic 480-4BF = D280-D2BF
// These mostly require just adding 1 to get the lowercase // These mostly require just adding 1 to get the lowercase
#$D2: #$D2:
begin begin
c := InStr[1]; case c2 of
case c of
#$80: #$80:
begin begin
if (CounterDiff <> 0) then OutStr^ := InStr[0]; c2 := chr(ord(c2) + 1);
OutStr[1] := chr(ord(c) + 1);
end; end;
// #$81 is already lowercase // #$81 is already lowercase
// #$82-#$89 ??? // #$82-#$89 ???
#$81..#$89:
begin
if (CounterDiff <> 0) then
begin
OutStr^ := InStr[0];
OutStr[1] := c;
end;
end;
#$8A..#$BF: #$8A..#$BF:
begin begin
if ord(c) mod 2 = 0 then if ord(c2) mod 2 = 0 then
begin c2 := chr(ord(c2) + 1);
if (CounterDiff <> 0) then OutStr^ := InStr[0];
OutStr[1] := chr(ord(c) + 1);
end
else if (CounterDiff <> 0) then
begin
OutStr^ := InStr[0];
OutStr[1] := c;
end; end;
end; end;
end; end;
end;
// Common code 2-byte modifiable chars
//if (CounterDiff <> 0) then
//begin
if (new_c1 <> c1) or (CounterDiff <> 0) then
OutStr^ := new_c1;
OutStr[1] := c2;
//end;
inc(InStr, 2); inc(InStr, 2);
inc(OutStr, 2); inc(OutStr, 2);
end; end;
@ -1863,19 +1773,19 @@ begin
// E1 83 80 - E1 83 85 => E2 B4 A0 - E2 B4 A5 // E1 83 80 - E1 83 85 => E2 B4 A0 - E2 B4 A5
#$E1: #$E1:
begin begin
c := InStr[1]; c2 := InStr[1];
c2 := InStr[2]; c3 := InStr[2];
if (c = #$82) and (c2 in [#$A0..#$BF]) then if (c2 = #$82) and (c3 in [#$A0..#$BF]) then
begin begin
OutStr^ := #$E2; OutStr^ := #$E2;
OutStr[1] := #$B4; OutStr[1] := #$B4;
OutStr[2] := chr(ord(c2) - $20); OutStr[2] := chr(ord(c3) - $20);
end end
else if (c = #$83) and (c2 in [#$80..#$85]) then else if (c2 = #$83) and (c3 in [#$80..#$85]) then
begin begin
OutStr^ := #$E2; OutStr^ := #$E2;
OutStr[1] := #$B4; OutStr[1] := #$B4;
OutStr[2] := chr(ord(c2) + $20); OutStr[2] := chr(ord(c3) + $20);
end end
else else
begin begin
@ -1891,7 +1801,7 @@ begin
end; end;
else else
// Copy the character if the string was disaligned by previous changes // Copy the character if the string was disaligned by previous changes
if (CounterDiff <> 0) then OutStr^:=c; if (CounterDiff <> 0) then OutStr^:= c1;
inc(InStr); inc(InStr);
inc(OutStr); inc(OutStr);
end; // Case InStr^ end; // Case InStr^
@ -1901,16 +1811,11 @@ begin
SetLength(Result,OutStr - PChar(Result)); SetLength(Result,OutStr - PChar(Result));
end; end;
function UTF8UpperCase(const AInStr: utf8string): utf8string;
begin
Result := UTF8UpperCase(AInStr, '');
end;
{ {
AInStr - The input string AInStr - The input string
ALocale - The locale. Use '' for maximum speed if one desires to ignore the locale ALocale - The locale. Use '' for maximum speed if one desires to ignore the locale
} }
function UTF8UpperCase(const AInStr, ALocale: utf8string): utf8string; function UTF8UpperCase(const AInStr: utf8string; ALocale: utf8string=''): utf8string;
var var
i, InCounter, OutCounter: PtrInt; i, InCounter, OutCounter: PtrInt;
OutStr: PChar; OutStr: PChar;

View File

@ -151,7 +151,7 @@ begin
AssertStringOperationUTF8LowerCase('Offset Georgian UTF8LowerCase 2', 'tu', 'IႲⴒ Ⴓⴓ Ⴔⴔ Ⴕⴕ Ⴖⴖ Ⴗⴗ Ⴘⴘ Ⴙⴙ Ⴚⴚ Ⴛⴛ Ⴜⴜ Ⴝⴝ Ⴞⴞ Ⴟⴟ Ⴠⴠ Ⴡⴡ Ⴢⴢ Ⴣⴣ Ⴤⴤ Ⴥⴥ', 'ıⴒⴒ ⴓⴓ ⴔⴔ ⴕⴕ ⴖⴖ ⴗⴗ ⴘⴘ ⴙⴙ ⴚⴚ ⴛⴛ ⴜⴜ ⴝⴝ ⴞⴞ ⴟⴟ ⴠⴠ ⴡⴡ ⴢⴢ ⴣⴣ ⴤⴤ ⴥⴥ'); AssertStringOperationUTF8LowerCase('Offset Georgian UTF8LowerCase 2', 'tu', 'IႲⴒ Ⴓⴓ Ⴔⴔ Ⴕⴕ Ⴖⴖ Ⴗⴗ Ⴘⴘ Ⴙⴙ Ⴚⴚ Ⴛⴛ Ⴜⴜ Ⴝⴝ Ⴞⴞ Ⴟⴟ Ⴠⴠ Ⴡⴡ Ⴢⴢ Ⴣⴣ Ⴤⴤ Ⴥⴥ', 'ıⴒⴒ ⴓⴓ ⴔⴔ ⴕⴕ ⴖⴖ ⴗⴗ ⴘⴘ ⴙⴙ ⴚⴚ ⴛⴛ ⴜⴜ ⴝⴝ ⴞⴞ ⴟⴟ ⴠⴠ ⴡⴡ ⴢⴢ ⴣⴣ ⴤⴤ ⴥⴥ');
// Performance test // Performance test
Write('Mattias LowerCase- Performance test took: '); { Write('Mattias LowerCase- Performance test took: ');
for j := 0 to 9 do begin for j := 0 to 9 do begin
lStartTime := Now; lStartTime := Now;
for i := 0 to TimerLoop do for i := 0 to TimerLoop do
@ -170,7 +170,7 @@ begin
lTimeDiff := Now - lStartTime; lTimeDiff := Now - lStartTime;
Write(Format(' %7d ms ', [DateTimeToMilliseconds(lTimeDiff)])); Write(Format(' %7d ms ', [DateTimeToMilliseconds(lTimeDiff)]));
end; end;
writeln; writeln; }
Write(' LowerCase-- Performance test took: '); Write(' LowerCase-- Performance test took: ');
for j := 0 to 9 do begin for j := 0 to 9 do begin
lStartTime := Now; lStartTime := Now;
@ -191,7 +191,7 @@ begin
Write(Format(' %7d ms ', [DateTimeToMilliseconds(lTimeDiff)])); Write(Format(' %7d ms ', [DateTimeToMilliseconds(lTimeDiff)]));
end; end;
writeln; writeln;
Write(' LowerCase-- Performance test took: '); Write(' LowerCase2-- Performance test took: ');
for j := 0 to 9 do begin for j := 0 to 9 do begin
lStartTime := Now; lStartTime := Now;
for i := 0 to TimerLoop do for i := 0 to TimerLoop do
@ -211,7 +211,7 @@ begin
Write(Format(' %7d ms ', [DateTimeToMilliseconds(lTimeDiff)])); Write(Format(' %7d ms ', [DateTimeToMilliseconds(lTimeDiff)]));
end; end;
writeln; writeln;
Write(' LowerCase-- Performance test took: '); Write(' Turk LowerCase-- Performance test took: ');
for j := 0 to 9 do begin for j := 0 to 9 do begin
lStartTime := Now; lStartTime := Now;
for i := 0 to TimerLoop do for i := 0 to TimerLoop do
@ -231,7 +231,7 @@ begin
Write(Format(' %7d ms ', [DateTimeToMilliseconds(lTimeDiff)])); Write(Format(' %7d ms ', [DateTimeToMilliseconds(lTimeDiff)]));
end; end;
writeln; writeln;
Write(' LowerCase-- Performance test took: '); Write(' Turk LowerCase2-- Performance test took: ');
for j := 0 to 9 do begin for j := 0 to 9 do begin
lStartTime := Now; lStartTime := Now;
for i := 0 to TimerLoop do for i := 0 to TimerLoop do