Large overhaul to simplify the lowercase1 code

git-svn-id: trunk@32861 -
This commit is contained in:
sekelsenmat 2011-10-12 16:38:56 +00:00
parent 443e4b5478
commit 7914ea9804
2 changed files with 175 additions and 270 deletions

View File

@ -69,8 +69,7 @@ function UTF8LowerCaseMattias(const s: utf8string): utf8string;
{$endif}
function UTF8LowerCase(const AInStr: utf8string; ALocale: utf8string=''): utf8string;
function UTF8LowerCase2(const AInStr: utf8string; ALocale: utf8string=''): utf8string;
function UTF8UpperCase(const AInStr: utf8string): utf8string;
function UTF8UpperCase(const AInStr, ALocale: utf8string): utf8string;
function UTF8UpperCase(const AInStr: utf8string; ALocale: utf8string=''): utf8string;
{function FindInvalidUTF8Character(p: PChar; Count: PtrInt;
// StopOnNonASCII: Boolean = false): PtrInt;
//function ValidUTF8String(const s: String): String;
@ -1460,15 +1459,16 @@ begin
c := InStr^;
case c of // if NOT TABLE
#0:
begin
if InStr >= InStrEnd then
begin
if InStr >= InStrEnd then begin
SetLength(Result,OutStr - PChar(Result));
exit;
end;
OutStr^:=c;
inc(InStr);
inc(OutStr);
SetLength(Result,OutStr - PChar(Result));
exit;
end;
OutStr^:=c;
inc(InStr);
inc(OutStr);
end;
'A'..'Z':
begin
{ First ASCII chars }
@ -1526,7 +1526,6 @@ begin
end;
end; // Case InStr^
end;
end;
end;
@ -1537,14 +1536,17 @@ end;
Data from here: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
List of ranges which have lowercase:
$0041..$0061 ASCII
codepoints UTF-8 range Description
$0041..$0061 $41..$61 ASCII
$00C0..$00DE: Result:=UnicodeLower00C0_00DE[u];
$0100..$024E: Result:=UnicodeLower0100_024E[u];
$0386..$03AB: Result:=UnicodeLower0386_03AB[u];
$03D8..$042F: Result:=UnicodeLower03D8_042F[u];
$0460..$0512: Result:=UnicodeLower0460_0512[u];
$0531..$0556: Result:=u+48;
$10A0..$10C5 Georgian
$10A0..$10C5 E1 82 A0..E1 82 BF
E1 83 80..E1 83 85 Georgian
$1E00..$1FFC: Result:=UnicodeLower1E00_1FFC[u];
$2126..$2183: Result:=UnicodeLower2126_2183[u];
$24B6..$24CF: Result:=u+26;
@ -1558,7 +1560,7 @@ var
InStr, InStrEnd, OutStr: PChar;
// Language identification
IsTurkish: Boolean;
c, c2: Char;
c1, c2, c3, new_c1: Char;
begin
Result:=AInStr;
InStr := PChar(AInStr);
@ -1568,8 +1570,8 @@ begin
// UniqueString if the resulting string will be identical
while (InStr < InStrEnd) do
begin
c := InStr^;
case c of
c1 := InStr^;
case c1 of
'A'..'Z',#$C3, #$C4, #$C5..#$C8, #$CE, #$D0..#$D2, #$E1: Break;
// already lower, or otherwhise not affected
else
@ -1588,14 +1590,15 @@ begin
while InStr < InStrEnd do
begin
c := InStr^;
case c of // if NOT TABLE
c1 := InStr^;
new_c1 := c1;
case c1 of
'A'..'Z':
begin
{ First ASCII chars }
// Special turkish handling
// capital undotted I to small undotted i
if IsTurkish and (c = 'I') then
if IsTurkish and (c1 = 'I') then
begin
OutStr := PChar(OutStr - PChar(Result));
SetLength(Result,Length(Result)+1);// Increase the buffer
@ -1604,256 +1607,163 @@ begin
inc(OutStr);
OutStr^ := #$B1;
dec(CounterDiff);
inc(InStr);
inc(OutStr);
end
else
begin
OutStr^ := chr(ord(c)+32);
inc(InStr);
inc(OutStr);
OutStr^ := chr(ord(c1)+32);
end;
inc(InStr);
inc(OutStr);
end;
// Latin Characters 00000FFF http://en.wikibooks.org/wiki/Unicode/Character_reference/0000-0FFF
// $C380..$C39E: NewChar := OldChar + $20;
// $C39F: ß already lowercase
#$C3:
// Chars with 2-bytes which might be modified
#$C3, #$C4, #$C5..#$C8, #$CE, #$D0..#$D2:
begin
if InStr[1] in [#$80..#$9E] then
c2 := InStr[1];
case c1 of
// Latin Characters 00000FFF http://en.wikibooks.org/wiki/Unicode/Character_reference/0000-0FFF
// $C380..$C39E: NewChar := OldChar + $20;
// $C39F: ß already lowercase
#$C3:
begin
if (CounterDiff <> 0) then OutStr^ := InStr[0];
OutStr[1] := chr(ord(InStr[1]) + $20);
end
else if (CounterDiff <> 0) then
begin
OutStr^ :=InStr[0];
OutStr[1] :=InStr[1];
if c2 in [#$80..#$9E] then
c2 := chr(ord(c2) + $20);
end;
inc(InStr, 2);
inc(OutStr, 2);
end;
// $C481..$C4A9: if OldChar mod 2 = 0 then NewChar := OldChar + 1;
// Turkish capital dotted i to small dotted i
// $C4B0 -> 'i'
// $C4B1 turkish lowercase undotted ı
// $C4B2..$C4B6: if OldChar mod 2 = 0 then NewChar := OldChar + 1;
// $C4B7: ĸ => K ?
// $C4B8..$C588: if OldChar mod 2 = 1 then NewChar := OldChar + 1;
#$C4:
begin
c := InStr[1];
case c of
#$81..#$A9, #$B2..#$B6: //0
begin
if ord(c) mod 2 = 0 then
// $C481..$C4A9: if OldChar mod 2 = 0 then NewChar := OldChar + 1;
// Turkish capital dotted i to small dotted i
// $C4B0 -> 'i'
// $C4B1 turkish lowercase undotted ı
// $C4B2..$C4B6: if OldChar mod 2 = 0 then NewChar := OldChar + 1;
// $C4B7: ĸ => K ?
// $C4B8..$C588: if OldChar mod 2 = 1 then NewChar := OldChar + 1;
#$C4:
begin
case c2 of
#$81..#$A9, #$B2..#$B6: //0
begin
if (CounterDiff <> 0) then OutStr^ := InStr[0];
OutStr[1] := chr(ord(c) + 1);
end
else if (CounterDiff <> 0) then
begin
OutStr^ :=InStr[0];
OutStr[1] :=c;
if ord(c2) mod 2 = 0 then
c2 := chr(ord(c2) + 1);
end;
end;
#$B8..#$FF: //1
begin
if ord(c) mod 2 = 1 then
#$B8..#$FF: //1
begin
if (CounterDiff <> 0) then OutStr^ := InStr[0];
OutStr[1] := chr(ord(c) + 1);
end
else if (CounterDiff <> 0) then
begin
OutStr^ :=InStr[0];
OutStr[1] :=c;
if ord(c2) mod 2 = 1 then
c2 := chr(ord(c2) + 1);
end;
end;
#$B0:
begin
OutStr^ := 'i';
dec(OutStr);
inc(CounterDiff, 1);
end;
end;
inc(InStr, 2);
inc(OutStr, 2);
end;
// $C589 ʼn => ?
// $C58A..$C5B7: if OldChar mod 2 = 0 then NewChar := OldChar + 1;
// $C5B8: NewChar := $C3BF; // Ÿ
// $C5B9..$C8B3: if OldChar mod 2 = 1 then NewChar := OldChar + 1;
#$C5:
begin
c := InStr[1];
case c of
#$8A..#$B7: //0
begin
if ord(c) mod 2 = 0 then
#$B0:
begin
if (CounterDiff <> 0) then OutStr^ := InStr[0];
OutStr[1] := chr(ord(c) + 1);
end
else if (CounterDiff <> 0) then
begin
OutStr^ := InStr[0];
OutStr[1] :=c;
end;
end;
#$00..#$88, #$B9..#$FF: //1
begin
if ord(c) mod 2 = 1 then
begin
if (CounterDiff <> 0) then OutStr^ := InStr[0];
OutStr[1] := chr(ord(c) + 1);
end
else if (CounterDiff <> 0) then
begin
OutStr^ := InStr[0];
OutStr[1] :=c;
end;
end;
#$B8: // Ÿ
begin
OutStr[0] := #$C3;
OutStr[1] := #$BF;
end;
end;
inc(InStr, 2);
inc(OutStr, 2);
end;
#$C6..#$C7:
begin
c := InStr[1];
if ord(c) mod 2 = 1 then
begin
if (CounterDiff <> 0) then OutStr^ := InStr[0];
OutStr[1] := chr(ord(c) + 1);
end
else if (CounterDiff <> 0) then
begin
OutStr^ := InStr[0];
OutStr[1] :=c;
end;
inc(InStr, 2);
inc(OutStr, 2);
end;
#$C8:
begin
c := InStr[1];
if (c in [#$00..#$B3]) and (ord(c) mod 2 = 1) then
begin
if (CounterDiff <> 0) then OutStr^ := InStr[0];
OutStr[1] := chr(ord(c) + 1);
end
else if (CounterDiff <> 0) then
begin
OutStr^ := InStr[0];
OutStr[1] :=c;
end;
inc(InStr, 2);
inc(OutStr, 2);
end;
// $CE91..$CE9F: NewChar := OldChar + $20; // Greek Characters
// $CEA0..$CEA9: NewChar := OldChar + $E0; // Greek Characters
#$CE:
begin
c := InStr[1];
case c of
#$91..#$9F:
begin
if (CounterDiff <> 0) then OutStr^ := InStr[0];
OutStr[1] := chr(ord(c) + $20);
end;
#$A0..#$A9:
begin
OutStr^ := chr(ord(InStr[0])+1);
OutStr[1] := chr(ord(c) - $10);
end;
end;
inc(InStr, 2);
inc(OutStr, 2);
end;
// $D080..$D08F: NewChar := OldChar + $110; // Cyrillic alphabet
// $D090..$D09F: NewChar := OldChar + $20; // Cyrillic alphabet
// $D0A0..$D0AF: NewChar := OldChar + $E0; // Cyrillic alphabet
#$D0:
begin
c := InStr[1];
case c of
#$80..#$8F:
begin
OutStr^ := chr(ord(InStr[0])+1);
OutStr[1] := chr(ord(c) + $10);
end;
#$90..#$9F:
begin
if (CounterDiff <> 0) then OutStr^ := InStr[0];
OutStr[1] := chr(ord(c) + $20);
end;
#$A0..#$AF:
begin
OutStr^ := chr(ord(InStr[0])+1);
OutStr[1] := chr(ord(c) - $20);
end;
end;
inc(InStr, 2);
inc(OutStr, 2);
end;
// Archaic and non-slavic cyrillic 460-47F = D1A0-D1BF
// These require just adding 1 to get the lowercase
#$D1:
begin
c := InStr[1];
if (c in [#$A0..#$BF]) and (ord(c) mod 2 = 0) then
begin
if (CounterDiff <> 0) then OutStr^ := InStr[0];
OutStr[1] := chr(ord(c) + 1);
end
else if (CounterDiff <> 0) then
begin
OutStr^ := InStr[0];
OutStr[1] := c;
end;
inc(InStr, 2);
inc(OutStr, 2);
end;
// Archaic and non-slavic cyrillic 480-4BF = D280-D2BF
// These mostly require just adding 1 to get the lowercase
#$D2:
begin
c := InStr[1];
case c of
#$80:
begin
if (CounterDiff <> 0) then OutStr^ := InStr[0];
OutStr[1] := chr(ord(c) + 1);
end;
// #$81 is already lowercase
// #$82-#$89 ???
#$81..#$89:
begin
if (CounterDiff <> 0) then
begin
OutStr^ := InStr[0];
OutStr[1] := c;
end;
end;
#$8A..#$BF:
begin
if ord(c) mod 2 = 0 then
begin
if (CounterDiff <> 0) then OutStr^ := InStr[0];
OutStr[1] := chr(ord(c) + 1);
end
else if (CounterDiff <> 0) then
begin
OutStr^ := InStr[0];
OutStr[1] := c;
OutStr^ := 'i';
inc(InStr, 2);
inc(OutStr);
inc(CounterDiff, 1);
Continue;
end;
end;
end;
// $C589 ʼn => ?
// $C58A..$C5B7: if OldChar mod 2 = 0 then NewChar := OldChar + 1;
// $C5B8: NewChar := $C3BF; // Ÿ
// $C5B9..$C8B3: if OldChar mod 2 = 1 then NewChar := OldChar + 1;
#$C5:
begin
case c2 of
#$8A..#$B7: //0
begin
if ord(c2) mod 2 = 0 then
c2 := chr(ord(c2) + 1);
end;
#$00..#$88, #$B9..#$FF: //1
begin
if ord(c2) mod 2 = 1 then
c2 := chr(ord(c2) + 1);
end;
#$B8: // Ÿ
begin
new_c1 := #$C3;
c2 := #$BF;
end;
end;
end;
#$C6..#$C7:
begin
if ord(c2) mod 2 = 1 then
c2 := chr(ord(c2) + 1);
end;
#$C8:
begin
if (c2 in [#$00..#$B3]) and (ord(c2) mod 2 = 1) then
c2 := chr(ord(c2) + 1);
end;
// $CE91..$CE9F: NewChar := OldChar + $20; // Greek Characters
// $CEA0..$CEA9: NewChar := OldChar + $E0; // Greek Characters
#$CE:
begin
case c2 of
#$91..#$9F:
begin
c2 := chr(ord(c2) + $20);
end;
#$A0..#$A9:
begin
c2 := chr(ord(c2) - $10);
end;
end;
end;
// $D080..$D08F: NewChar := OldChar + $110; // Cyrillic alphabet
// $D090..$D09F: NewChar := OldChar + $20; // Cyrillic alphabet
// $D0A0..$D0AF: NewChar := OldChar + $E0; // Cyrillic alphabet
#$D0:
begin
c2 := InStr[1];
case c2 of
#$80..#$8F:
begin
new_c1 := chr(ord(c1)+1);
c2 := chr(ord(c2) + $10);
end;
#$90..#$9F:
begin
c2 := chr(ord(c2) + $20);
end;
#$A0..#$AF:
begin
new_c1 := chr(ord(c1)+1);
c2 := chr(ord(c2) - $20);
end;
end;
end;
// Archaic and non-slavic cyrillic 460-47F = D1A0-D1BF
// These require just adding 1 to get the lowercase
#$D1:
begin
if (c2 in [#$A0..#$BF]) and (ord(c2) mod 2 = 0) then
c2 := chr(ord(c2) + 1);
end;
// Archaic and non-slavic cyrillic 480-4BF = D280-D2BF
// These mostly require just adding 1 to get the lowercase
#$D2:
begin
case c2 of
#$80:
begin
c2 := chr(ord(c2) + 1);
end;
// #$81 is already lowercase
// #$82-#$89 ???
#$8A..#$BF:
begin
if ord(c2) mod 2 = 0 then
c2 := chr(ord(c2) + 1);
end;
end;
end;
end;
// Common code 2-byte modifiable chars
//if (CounterDiff <> 0) then
//begin
if (new_c1 <> c1) or (CounterDiff <> 0) then
OutStr^ := new_c1;
OutStr[1] := c2;
//end;
inc(InStr, 2);
inc(OutStr, 2);
end;
@ -1863,19 +1773,19 @@ begin
// E1 83 80 - E1 83 85 => E2 B4 A0 - E2 B4 A5
#$E1:
begin
c := InStr[1];
c2 := InStr[2];
if (c = #$82) and (c2 in [#$A0..#$BF]) then
c2 := InStr[1];
c3 := InStr[2];
if (c2 = #$82) and (c3 in [#$A0..#$BF]) then
begin
OutStr^ := #$E2;
OutStr[1] := #$B4;
OutStr[2] := chr(ord(c2) - $20);
OutStr[2] := chr(ord(c3) - $20);
end
else if (c = #$83) and (c2 in [#$80..#$85]) then
else if (c2 = #$83) and (c3 in [#$80..#$85]) then
begin
OutStr^ := #$E2;
OutStr[1] := #$B4;
OutStr[2] := chr(ord(c2) + $20);
OutStr[2] := chr(ord(c3) + $20);
end
else
begin
@ -1889,9 +1799,9 @@ begin
inc(InStr, 3);
inc(OutStr, 3);
end;
else
else
// Copy the character if the string was disaligned by previous changes
if (CounterDiff <> 0) then OutStr^:=c;
if (CounterDiff <> 0) then OutStr^:= c1;
inc(InStr);
inc(OutStr);
end; // Case InStr^
@ -1901,16 +1811,11 @@ begin
SetLength(Result,OutStr - PChar(Result));
end;
function UTF8UpperCase(const AInStr: utf8string): utf8string;
begin
Result := UTF8UpperCase(AInStr, '');
end;
{
AInStr - The input string
ALocale - The locale. Use '' for maximum speed if one desires to ignore the locale
}
function UTF8UpperCase(const AInStr, ALocale: utf8string): utf8string;
function UTF8UpperCase(const AInStr: utf8string; ALocale: utf8string=''): utf8string;
var
i, InCounter, OutCounter: PtrInt;
OutStr: PChar;

View File

@ -151,7 +151,7 @@ begin
AssertStringOperationUTF8LowerCase('Offset Georgian UTF8LowerCase 2', 'tu', 'IႲⴒ Ⴓⴓ Ⴔⴔ Ⴕⴕ Ⴖⴖ Ⴗⴗ Ⴘⴘ Ⴙⴙ Ⴚⴚ Ⴛⴛ Ⴜⴜ Ⴝⴝ Ⴞⴞ Ⴟⴟ Ⴠⴠ Ⴡⴡ Ⴢⴢ Ⴣⴣ Ⴤⴤ Ⴥⴥ', 'ıⴒⴒ ⴓⴓ ⴔⴔ ⴕⴕ ⴖⴖ ⴗⴗ ⴘⴘ ⴙⴙ ⴚⴚ ⴛⴛ ⴜⴜ ⴝⴝ ⴞⴞ ⴟⴟ ⴠⴠ ⴡⴡ ⴢⴢ ⴣⴣ ⴤⴤ ⴥⴥ');
// Performance test
Write('Mattias LowerCase- Performance test took: ');
{ Write('Mattias LowerCase- Performance test took: ');
for j := 0 to 9 do begin
lStartTime := Now;
for i := 0 to TimerLoop do
@ -170,7 +170,7 @@ begin
lTimeDiff := Now - lStartTime;
Write(Format(' %7d ms ', [DateTimeToMilliseconds(lTimeDiff)]));
end;
writeln;
writeln; }
Write(' LowerCase-- Performance test took: ');
for j := 0 to 9 do begin
lStartTime := Now;
@ -191,7 +191,7 @@ begin
Write(Format(' %7d ms ', [DateTimeToMilliseconds(lTimeDiff)]));
end;
writeln;
Write(' LowerCase-- Performance test took: ');
Write(' LowerCase2-- Performance test took: ');
for j := 0 to 9 do begin
lStartTime := Now;
for i := 0 to TimerLoop do
@ -211,7 +211,7 @@ begin
Write(Format(' %7d ms ', [DateTimeToMilliseconds(lTimeDiff)]));
end;
writeln;
Write(' LowerCase-- Performance test took: ');
Write(' Turk LowerCase-- Performance test took: ');
for j := 0 to 9 do begin
lStartTime := Now;
for i := 0 to TimerLoop do
@ -231,7 +231,7 @@ begin
Write(Format(' %7d ms ', [DateTimeToMilliseconds(lTimeDiff)]));
end;
writeln;
Write(' LowerCase-- Performance test took: ');
Write(' Turk LowerCase2-- Performance test took: ');
for j := 0 to 9 do begin
lStartTime := Now;
for i := 0 to TimerLoop do