Completes lowercase up to codepoint 1EFF and adds many tests

git-svn-id: trunk@32880 -
This commit is contained in:
sekelsenmat 2011-10-14 09:53:23 +00:00
parent c3ae437f84
commit d6cddfc645
2 changed files with 220 additions and 29 deletions

View File

@ -1536,7 +1536,7 @@ var
InStr, InStrEnd, OutStr: PChar;
// Language identification
IsTurkish: Boolean;
c1, c2, c3, new_c1, new_c2: Char;
c1, c2, c3, new_c1, new_c2, new_c3: Char;
begin
Result:=AInStr;
InStr := PChar(AInStr);
@ -1549,7 +1549,7 @@ begin
c1 := InStr^;
case c1 of
'A'..'Z': Break;
#$C3, #$C4, #$C5..#$C8, #$CE, #$D0..#$D2, #$E1:
#$C3..#$C9, #$CE, #$D0..#$D5, #$E1:
begin
c2 := InStr[1];
case c1 of
@ -1570,9 +1570,10 @@ begin
#$B8: Break;
end;
end;
#$C6..#$C8,#$CE, #$D0..#$D2, #$E1: Break;
// already lower, or otherwise not affected
else
Break;
end;
// already lower, or otherwise not affected
end;
end;
inc(InStr);
@ -1617,8 +1618,7 @@ begin
end;
// Chars with 2-bytes which might be modified
//#$C3..#$C8, #$CE, #$D0..#$D2:
#$C3..#$D2:
#$C3..#$D5:
begin
c2 := InStr[1];
new_c1 := c1;
@ -1655,7 +1655,12 @@ begin
inc(CounterDiff, 1);
Continue;
end;
#$B9..#$BF: if ord(c2) mod 2 = 1 then new_c2 := chr(ord(c2) + 1);
#$B9..#$BE: if ord(c2) mod 2 = 1 then new_c2 := chr(ord(c2) + 1);
#$BF: // The lowercase form falls under the next UTF-8 lead byte, so both bytes change
begin
new_c1 := #$C5;
new_c2 := #$80;
end;
end;
end;
// $C589 ʼn
@ -1670,7 +1675,7 @@ begin
if ord(c2) mod 2 = 0 then
new_c2 := chr(ord(c2) + 1);
end;
#$00..#$88, #$B9..#$FF: //1
#$00..#$88, #$B9..#$BE: //1
begin
if ord(c2) mod 2 = 1 then
new_c2 := chr(ord(c2) + 1);
@ -1814,7 +1819,7 @@ begin
01A3;LATIN SMALL LETTER OI;Ll;0;L;;;;;N;LATIN SMALL LETTER O I;;01A2;;01A2 <=
01A4;LATIN CAPITAL LETTER P WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER P HOOK;;;01A5; => +1
01A5;LATIN SMALL LETTER P WITH HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER P HOOK;;01A4;;01A4 <=
01A6;LATIN LETTER YR;Lu;0;L;;;;;N;LATIN LETTER Y R;;;0280; <=
01A6;LATIN LETTER YR;Lu;0;L;;;;;N;LATIN LETTER Y R;;;0280; => CA 80
01A7;LATIN CAPITAL LETTER TONE TWO;Lu;0;L;;;;;N;;;;01A8; => +1
01A8;LATIN SMALL LETTER TONE TWO;Ll;0;L;;;;;N;;;01A7;;01A7 <=
01A9;LATIN CAPITAL LETTER ESH;Lu;0;L;;;;;N;;;;0283; => CA 83
@ -1835,6 +1840,11 @@ begin
if ord(c2) mod 2 = 1 then
new_c2 := chr(ord(c2) + 1);
end;
#$A6:
begin
new_c1 := #$CA;
new_c2 := #$80;
end;
#$A9:
begin
new_c1 := #$CA;
@ -1974,6 +1984,41 @@ begin
#$BB: new_c2 := chr(ord(c2) + 1);
end;
end;
{
Codepoints 0240 to 027F
Only codepoints within 0240..024F need lowercasing here
}
#$C9:
begin
case c2 of
#$81..#$82:
begin
if ord(c2) mod 2 = 1 then
new_c2 := chr(ord(c2) + 1);
end;
#$86..#$8F:
begin
if ord(c2) mod 2 = 0 then
new_c2 := chr(ord(c2) + 1);
end;
#$83:
begin
new_c1 := #$C6;
new_c2 := #$80;
end;
#$84:
begin
new_c1 := #$CA;
new_c2 := #$89;
end;
#$85:
begin
new_c1 := #$CA;
new_c2 := #$8C;
end;
end;
end;
// $CE91..$CE9F: NewChar := OldChar + $20; // Greek Characters
// $CEA0..$CEA9: NewChar := OldChar + $E0; // Greek Characters
#$CE:
@ -2037,6 +2082,61 @@ begin
end;
end;
end;
{
Codepoints 04C0..04FF
}
#$D3:
begin
case c2 of
#$80: new_c2 := #$8F;
#$81..#$8E:
begin
if ord(c2) mod 2 = 1 then
new_c2 := chr(ord(c2) + 1);
end;
#$90..#$BF:
begin
if ord(c2) mod 2 = 0 then
new_c2 := chr(ord(c2) + 1);
end;
end;
end;
{
Codepoints 0500..053F
Armenian starts in 0531
}
#$D4:
begin
if ord(c2) mod 2 = 0 then
new_c2 := chr(ord(c2) + 1);
// Armenian
if c2 in [#$B1..#$BF] then
begin
new_c1 := #$D5;
new_c2 := chr(ord(c2) - $10);
end;
end;
{
Codepoints 0540..057F
Armenian
}
#$D5:
begin
case c2 of
#$80..#$8F:
begin
new_c2 := chr(ord(c2) + $30);
end;
#$90..#$96:
begin
new_c1 := #$D6;
new_c2 := chr(ord(c2) - $10);
end;
end;
end;
end;
// Common code 2-byte modifiable chars
if (CounterDiff <> 0) then
@ -2052,36 +2152,90 @@ begin
inc(InStr, 2);
inc(OutStr, 2);
end;
// Georgian codepoints 10A0-10C5 => 2D00-2D25
// In UTF-8 this is:
// E1 82 A0 - E1 82 BF => E2 B4 80 - E2 B4 9F
// E1 83 80 - E1 83 85 => E2 B4 A0 - E2 B4 A5
{
Characters with 3 bytes
}
#$E1:
begin
new_c1 := c1;
c2 := InStr[1];
c3 := InStr[2];
new_c2 := c2;
new_c3 := c3;
{
Georgian codepoints 10A0-10C5 => 2D00-2D25
In UTF-8 this is:
E1 82 A0 - E1 82 BF => E2 B4 80 - E2 B4 9F
E1 83 80 - E1 83 85 => E2 B4 A0 - E2 B4 A5
}
if (c2 = #$82) and (c3 in [#$A0..#$BF]) then
begin
OutStr^ := #$E2;
OutStr[1] := #$B4;
OutStr[2] := chr(ord(c3) - $20);
new_c1 := #$E2;
new_c2 := #$B4;
new_c3 := chr(ord(c3) - $20);
end
else if (c2 = #$83) and (c3 in [#$80..#$85]) then
begin
OutStr^ := #$E2;
OutStr[1] := #$B4;
OutStr[2] := chr(ord(c3) + $20);
new_c1 := #$E2;
new_c2 := #$B4;
new_c3 := chr(ord(c3) + $20);
end
{
Extra chars between 1E00..1EFF
Blocks of chars:
1E00..1E3F E1 B8 80..E1 B8 BF
1E40..1E7F E1 B9 80..E1 B9 BF
1E80..1EBF E1 BA 80..E1 BA BF
1EC0..1EFF E1 BB 80..E1 BB BF
}
else if c2 in [#$B8..#$BB] then
begin
// Start with a default and change for some particular chars
if ord(c3) mod 2 = 0 then
new_c3 := chr(ord(c3) + 1);
{ Only 1E96..1E9F (UTF-8 E1 BA 96..E1 BA 9F) deviate from the "even => +1" default:
1E96;LATIN SMALL LETTER H WITH LINE BELOW;Ll;0;L;0068 0331;;;;N;;;;;
1E97;LATIN SMALL LETTER T WITH DIAERESIS;Ll;0;L;0074 0308;;;;N;;;;;
1E98;LATIN SMALL LETTER W WITH RING ABOVE;Ll;0;L;0077 030A;;;;N;;;;;
1E99;LATIN SMALL LETTER Y WITH RING ABOVE;Ll;0;L;0079 030A;;;;N;;;;;
1E9A;LATIN SMALL LETTER A WITH RIGHT HALF RING;Ll;0;L;<compat> 0061 02BE;;;;N;;;;;
1E9B;LATIN SMALL LETTER LONG S WITH DOT ABOVE;Ll;0;L;017F 0307;;;;N;;;1E60;;1E60
1E9C;LATIN SMALL LETTER LONG S WITH DIAGONAL STROKE;Ll;0;L;;;;;N;;;;;
1E9D;LATIN SMALL LETTER LONG S WITH HIGH STROKE;Ll;0;L;;;;;N;;;;;
1E9E;LATIN CAPITAL LETTER SHARP S;Lu;0;L;;;;;N;;;;00DF; => C3 9F
1E9F;LATIN SMALL LETTER DELTA;Ll;0;L;;;;;N;;;;;
}
if (c2 = #$BA) and (c3 in [#$96..#$9F]) then new_c3 := c3;
// LATIN CAPITAL LETTER SHARP S (1E9E) => German sharp s (00DF, UTF-8 C3 9F); the output is one byte shorter
if (c2 = #$BA) and (c3 = #$9E) then
begin
inc(InStr, 3);
OutStr^ := #$C3;
inc(OutStr);
OutStr^ := #$9F;
inc(OutStr);
inc(CounterDiff, 1);
Continue;
end;
end;
if (CounterDiff <> 0) then
begin
OutStr^ := new_c1;
OutStr[1] := new_c2;
OutStr[2] := new_c3;
end
else
begin
if (CounterDiff <> 0) then
begin
OutStr^ := InStr[0];
OutStr[1] := InStr[1];
OutStr[2] := InStr[2];
end;
if c1 <> new_c1 then OutStr^ := new_c1;
if c2 <> new_c2 then OutStr[1] := new_c2;
if c3 <> new_c3 then OutStr[2] := new_c3;
end;
inc(InStr, 3);
inc(OutStr, 3);
end;

View File

@ -25,13 +25,13 @@ begin
begin
Write(' Expected ', AStrExpected2, ' !Error!');
WriteLn();
Write('Got Len=', Length(AStr2),' ');
Write('Got Len=', Length(AStr2), ' Str=');
WriteStringHex(AStr2);
WriteLn('');
Write('Expected Len=', Length(AStrExpected2),' ');
Write('Expected Len=', Length(AStrExpected2), ' Str=');
WriteStringHex(AStrExpected2);
WriteLn();
Write('Orig Len=', Length(AStr1),' ');
Write('Orig Len=', Length(AStr1), ' Str=');
WriteStringHex(AStr1);
WriteLn('');
end;
@ -112,21 +112,58 @@ begin
AssertStringOperationUTF8LowerCase('Polish UTF8LowerCase 1', '', 'aąbcćdeęfghijklłmnńoóprsśtuwyzźż', 'aąbcćdeęfghijklłmnńoóprsśtuwyzźż');
AssertStringOperationUTF8LowerCase('Polish UTF8LowerCase 2', '', 'AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻ', 'aąbcćdeęfghijklłmnńoóprsśtuwyzźż');
AssertStringOperationUTF8LowerCase('German UTF8LowerCase 1', '', 'Ä/ä,Ö/ö,Ü/ü,ß', 'ä/ä,ö/ö,ü/ü,ß');
// Unicode table
AssertStringOperationUTF8LowerCase('Latin 00C0 UTF8LowerCase', '', 'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏ', 'àáâãäåæçèéêëìíîï');
AssertStringOperationUTF8LowerCase('Latin 00D0 UTF8LowerCase', '', 'ÐÑÒÓÔÕÖרÙÚÛÜÝÞß', 'ðñòóôõö×øùúûüýþß');
AssertStringOperationUTF8LowerCase('Latin 00E0 UTF8LowerCase', '', 'àáâãäåæçèéêëìíîï', 'àáâãäåæçèéêëìíîï');
AssertStringOperationUTF8LowerCase('Latin 00F0 UTF8LowerCase', '', 'ðñòóôõö÷øùúûüýþÿ', 'ðñòóôõö÷øùúûüýþÿ');
AssertStringOperationUTF8LowerCase('Latin 0100 UTF8LowerCase', '', 'Āā Ăă Ąą Ćć Ĉĉ Ċċ Čč Ďď', 'āā ăă ąą ćć ĉĉ ċċ čč ďď');
AssertStringOperationUTF8LowerCase('Latin 0110 UTF8LowerCase', '', 'ĐđĒēĔĕĖėĘęĚěĜĝĞğ', 'đđēēĕĕėėęęěěĝĝğğ');
AssertStringOperationUTF8LowerCase('Latin 0120 UTF8LowerCase', '', 'ĠġĢģĤĥĦħĨĩĪīĬĭĮį', 'ġġģģĥĥħħĩĩīīĭĭįį');
AssertStringOperationUTF8LowerCase('Latin 0130 UTF8LowerCase', '', 'İıIJijĴĵĶķĸĹĺĻļĽľĿ', 'iıijijĵĵķķĸĺĺļļľľŀ');
AssertStringOperationUTF8LowerCase('Latin 0140 UTF8LowerCase', '', 'ŀŁłŃńŅņŇňʼnŊŋŌōŎŏ', 'ŀłłńńņņňňʼnŋŋōōŏŏ');
AssertStringOperationUTF8LowerCase('Latin 0150 UTF8LowerCase', '', 'ŐőŒœŔŕŖŗŘřŚśŜŝŞş', 'őőœœŕŕŗŗřřśśŝŝşş');
AssertStringOperationUTF8LowerCase('Latin 0160 UTF8LowerCase', '', 'ŠšŢţŤťŦŧŨũŪūŬŭŮů', 'ššţţťťŧŧũũūūŭŭůů');
AssertStringOperationUTF8LowerCase('Latin 0170 UTF8LowerCase', '', 'ŰűŲųŴŵŶŷŸŹźŻżŽžſ', 'űűųųŵŵŷŷÿźźżżžžſ');
AssertStringOperationUTF8LowerCase('Latin 0180 UTF8LowerCase', '', 'ƀ Ɓ Ƃƃ Ƅƅ Ɔ Ƈƈ Ɖ Ɗ Ƌƌ ƍ Ǝ Ə', 'ƀ ɓ ƃƃ ƅƅ ɔ ƈƈ ɖ ɗ ƌƌ ƍ ǝ ə');
AssertStringOperationUTF8LowerCase('Latin 0190 UTF8LowerCase', '', 'ƐƑƒƓƔƕƖƗƘƙƚƛƜƝƞƟ', 'ɛƒƒɠɣƕɩɨƙƙƚƛɯɲƞɵ');
AssertStringOperationUTF8LowerCase('Latin 01A0 UTF8LowerCase', '', 'ƠơƢƣƤƥƦƧƨƩƪƫƬƭƮƯ', 'ơơƣƣƥƥƦƨƨʃƪƫƭƭʈư');
AssertStringOperationUTF8LowerCase('Latin 01A0 UTF8LowerCase', '', 'ƠơƢƣƤƥƦƧƨƩƪƫƬƭƮƯ', 'ơơƣƣƥƥʀƨƨʃƪƫƭƭʈư');
AssertStringOperationUTF8LowerCase('Latin 01B0 UTF8LowerCase', '', 'ưƱƲƳƴƵƶƷƸƹƺƻƼƽƾƿ', 'ưʊʋƴƴƶƶʒƹƹƺƻƽƽƾƿ');
AssertStringOperationUTF8LowerCase('Latin 01C0 UTF8LowerCase', '', 'ǀǁǂǃDŽDždžLJLjljNJNjnjǍǎǏ', 'ǀǁǂǃdždždžljljljnjnjnjǎǎǐ');
AssertStringOperationUTF8LowerCase('Latin 0200 UTF8LowerCase', '', 'ȀȁȂȃȄȅȆȇȈȉȊȋȌȍȎȏ', 'ȁȁȃȃȅȅȇȇȉȉȋȋȍȍȏȏ');
AssertStringOperationUTF8LowerCase('Latin 0210 UTF8LowerCase', '', 'ȐȑȒȓȔȕȖȗȘșȚțȜȝȞȟ', 'ȑȑȓȓȕȕȗȗșșțțȝȝȟȟ');
AssertStringOperationUTF8LowerCase('Latin 0220 UTF8LowerCase', '', 'ȠȡȢȣȤȥȦȧȨȩȪȫȬȭȮȯ', 'ƞȡȣȣȥȥȧȧȩȩȫȫȭȭȯȯ');
AssertStringOperationUTF8LowerCase('Latin 0230 UTF8LowerCase', '', 'ȰȱȲȳȴȵȶȷȸȹȺȻȼȽȾȿ', 'ȱȱȳȳȴȵȶȷȸȹⱥȼȼƚⱦȿ');
AssertStringOperationUTF8LowerCase('Latin 0240 UTF8LowerCase', '', 'ɀɁɂɃɄɅɆɇɈɉɊɋɌɍɎɏ', 'ɀɂɂƀʉʌɇɇɉɉɋɋɍɍɏɏ');
AssertStringOperationUTF8LowerCase('Latin 0250 UTF8LowerCase', '', 'ɐɑɒɓɔɕɖɗɘəɚɛɜɝɞɟ', 'ɐɑɒɓɔɕɖɗɘəɚɛɜɝɞɟ');
AssertStringOperationUTF8LowerCase('Unicode 0400 UTF8LowerCase', '', 'ЀЁЂЃЄЅІЇЈЉЊЋЌЍЎЏ', 'ѐёђѓєѕіїјљњћќѝўџ');
AssertStringOperationUTF8LowerCase('Unicode 0410 UTF8LowerCase', '', 'АБВГДЕЖЗИЙКЛМНОП', 'абвгдежзийклмноп');
AssertStringOperationUTF8LowerCase('Unicode 0420 UTF8LowerCase', '', 'РСТУФХЦЧШЩЪЫЬЭЮЯ', 'рстуфхцчшщъыьэюя');
AssertStringOperationUTF8LowerCase('Unicode 0430 UTF8LowerCase', '', 'абвгдежзийклмноп', 'абвгдежзийклмноп');
AssertStringOperationUTF8LowerCase('Unicode 0440 UTF8LowerCase', '', 'рстуфхцчшщъыьэюя', 'рстуфхцчшщъыьэюя');
AssertStringOperationUTF8LowerCase('Unicode 0450 UTF8LowerCase', '', 'ѐёђѓєѕіїјљњћќѝўџ', 'ѐёђѓєѕіїјљњћќѝўџ');
AssertStringOperationUTF8LowerCase('Unicode 0460 UTF8LowerCase', '', 'ѠѡѢѣѤѥѦѧѨѩѪѫѬѭѮѯ', 'ѡѡѣѣѥѥѧѧѩѩѫѫѭѭѯѯ');
AssertStringOperationUTF8LowerCase('Unicode 0470 UTF8LowerCase', '', 'ѰѱѲѳѴѵѶѷѸѹѺѻѼѽѾѿ', 'ѱѱѳѳѵѵѷѷѹѹѻѻѽѽѿѿ');
AssertStringOperationUTF8LowerCase('Unicode 0480 UTF8LowerCase', '', 'Ҁҁ҂ ҃ ҄ ҅ ҆ ҇ ҈ ҉ҊҋҌҍҎҏ', 'ҁҁ҂ ҃ ҄ ҅ ҆ ҇ ҈ ҉ҋҋҍҍҏҏ');
AssertStringOperationUTF8LowerCase('Unicode 0490 UTF8LowerCase', '', 'ҐґҒғҔҕҖҗҘҙҚқҜҝҞҟ', 'ґґғғҕҕҗҗҙҙққҝҝҟҟ');
AssertStringOperationUTF8LowerCase('Unicode 04A0 UTF8LowerCase', '', 'ҠҡҢңҤҥҦҧҨҩҪҫҬҭҮү', 'ҡҡңңҥҥҧҧҩҩҫҫҭҭүү');
AssertStringOperationUTF8LowerCase('Unicode 04B0 UTF8LowerCase', '', 'ҰұҲҳҴҵҶҷҸҹҺһҼҽҾҿ', 'ұұҳҳҵҵҷҷҹҹһһҽҽҿҿ');
AssertStringOperationUTF8LowerCase('Unicode 04C0 UTF8LowerCase', '', 'ӀӁӂӃӄӅӆӇӈӉӊӋӌӍӎӏ', 'ӏӂӂӄӄӆӆӈӈӊӊӌӌӎӎӏ');
AssertStringOperationUTF8LowerCase('Unicode 04D0 UTF8LowerCase', '', 'ӐӑӒӓӔӕӖӗӘәӚӛӜӝӞӟ', 'ӑӑӓӓӕӕӗӗәәӛӛӝӝӟӟ');
AssertStringOperationUTF8LowerCase('Unicode 04E0 UTF8LowerCase', '', 'ӠӡӢӣӤӥӦӧӨөӪӫӬӭӮӯ', 'ӡӡӣӣӥӥӧӧөөӫӫӭӭӯӯ');
AssertStringOperationUTF8LowerCase('Unicode 04F0 UTF8LowerCase', '', 'ӰӱӲӳӴӵӶӷӸӹӺӻӼӽӾӿ', 'ӱӱӳӳӵӵӷӷӹӹӻӻӽӽӿӿ');
AssertStringOperationUTF8LowerCase('Unicode 0500 UTF8LowerCase', '', 'ԀԁԂԃԄԅԆԇԈԉԊԋԌԍԎԏ', 'ԁԁԃԃԅԅԇԇԉԉԋԋԍԍԏԏ');
AssertStringOperationUTF8LowerCase('Unicode 0510 UTF8LowerCase', '', 'ԐԑԒԓԔԕԖԗԘԙԚԛԜԝԞԟ', 'ԑԑԓԓԕԕԗԗԙԙԛԛԝԝԟԟ');
AssertStringOperationUTF8LowerCase('Unicode 0520 UTF8LowerCase', '', 'ԠԡԢԣԤԥԦԧ', 'ԡԡԣԣԥԥԧԧ');
// Armenian Unicode Table
AssertStringOperationUTF8LowerCase('Unicode 0530 UTF8LowerCase', '', 'ԱԲԳԴԵԶԷԸԹԺԻԼԽԾԿ', 'աբգդեզէըթժիլխծկ');
AssertStringOperationUTF8LowerCase('Unicode 0540 UTF8LowerCase', '', 'ՀՁՂՃՄՅՆՇՈՉՊՋՌՍՎՏ', 'հձղճմյնշոչպջռսվտ');
AssertStringOperationUTF8LowerCase('Unicode 0550 UTF8LowerCase', '', 'ՐՑՒՓՔՕՖ', 'րցւփքօֆ');
AssertStringOperationUTF8LowerCase('Unicode 0560 UTF8LowerCase', '', 'աբգդեզէըթժիլխծկ', 'աբգդեզէըթժիլխծկ');
AssertStringOperationUTF8LowerCase('Unicode 0570 UTF8LowerCase', '', 'հձղճմյնշոչպջռսվտ', 'հձղճմյնշոչպջռսվտ');
AssertStringOperationUTF8LowerCase('Unicode 0580 UTF8LowerCase', '', 'րցւփքօֆ', 'րցւփքօֆ');
// Higher Unicode Table
AssertStringOperationUTF8LowerCase('Unicode 1E00 UTF8LowerCase', '', 'ḀḁḂḃḄḅḆḇḈḉḊḋḌḍḎḏ', 'ḁḁḃḃḅḅḇḇḉḉḋḋḍḍḏḏ');
// Turkish
AssertStringOperationUTF8LowerCase('Turkish UTF8LowerCase 1', 'tu', 'abcçdefgğhııijklmnoöprsştuüvyz', 'abcçdefgğhııijklmnoöprsştuüvyz');
AssertStringOperationUTF8LowerCase('Turkish UTF8LowerCase 2', 'tu', 'ABCÇDEFGĞHIIİJKLMNOÖPRSŞTUÜVYZ', 'abcçdefgğhııijklmnoöprsştuüvyz');