Lowercase: Adds Georgian support

git-svn-id: trunk@32854 -
This commit is contained in:
sekelsenmat 2011-10-12 13:51:33 +00:00
parent a617d46d26
commit 5d79eb6782
2 changed files with 77 additions and 35 deletions

View File

@ -68,8 +68,7 @@ function UnicodeLowercase(u: cardinal): cardinal;
function UTF8LowerCaseMattias(const s: utf8string): utf8string;
{$endif}
function UTF8LowerCase(const AInStr: utf8string; ALocale: utf8string=''): utf8string;
function UTF8UpperCase(const AInStr: utf8string): utf8string;
function UTF8UpperCase(const AInStr, ALocale: utf8string): utf8string;
function UTF8UpperCase(const AInStr: utf8string; ALocale: utf8string=''): utf8string;
{function FindInvalidUTF8Character(p: PChar; Count: PtrInt;
// StopOnNonASCII: Boolean = false): PtrInt;
//function ValidUTF8String(const s: String): String;
@ -1168,6 +1167,24 @@ end;
{
AInStr - The input string
ALocale - The locale. Use '' for maximum speed if one desires to ignore the locale
Data from here: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
List of ranges which have lowercase:
$0041..$005A ASCII
$00C0..$00DE: Result:=UnicodeLower00C0_00DE[u];
$0100..$024E: Result:=UnicodeLower0100_024E[u];
$0386..$03AB: Result:=UnicodeLower0386_03AB[u];
$03D8..$042F: Result:=UnicodeLower03D8_042F[u];
$0460..$0512: Result:=UnicodeLower0460_0512[u];
$0531..$0556: Result:=u+48;
$10A0..$10C5 Georgian
$1E00..$1FFC: Result:=UnicodeLower1E00_1FFC[u];
$2126..$2183: Result:=UnicodeLower2126_2183[u];
$24B6..$24CF: Result:=u+26;
$2C00..$2C2E: Result:=u+48;
$2C60..$2CE2: Result:=UnicodeLower2C60_2CE2[u];
$FF21..$FF3A: Result:=u+32;
}
function UTF8LowerCase(const AInStr: utf8string; ALocale: utf8string=''): utf8string;
var
@ -1175,7 +1192,7 @@ var
InStr, InStrEnd, OutStr: PChar;
// Language identification
IsTurkish: Boolean;
c: Char;
c, c2: Char;
begin
Result:=AInStr;
InStr := PChar(AInStr);
@ -1187,7 +1204,7 @@ begin
begin
c := InStr^;
case c of
'A'..'Z',#$C3, #$C4, #$C5..#$C8, #$CE, #$D0..#$D2: Break;
'A'..'Z',#$C3, #$C4, #$C5..#$C8, #$CE, #$D0..#$D2,#$E1: Break;
// already lower, or otherwise not affected
else
inc(InStr);
@ -1474,6 +1491,38 @@ begin
inc(InStr, 2);
inc(OutStr, 2);
end;
// Georgian codepoints 10A0-10C5 => 2D00-2D25
// In UTF-8 this is:
// E1 82 A0 - E1 82 BF => E2 B4 80 - E2 B4 9F
// E1 83 80 - E1 83 85 => E2 B4 A0 - E2 B4 A5
#$E1:
begin
c := InStr[1];
c2 := InStr[2];
if (c = #$82) and (c2 in [#$A0..#$BF]) then
begin
OutStr^ := #$E2;
OutStr[1] := #$B4;
OutStr[2] := chr(ord(c2) - $20);
end
else if (c = #$83) and (c2 in [#$80..#$85]) then
begin
OutStr^ := #$E2;
OutStr[1] := #$B4;
OutStr[2] := chr(ord(c2) + $20);
end
else
begin
if (CounterDiff <> 0) then
begin
OutStr^ := InStr[0];
OutStr[1] := InStr[1];
OutStr[2] := InStr[2];
end;
end;
inc(InStr, 3);
inc(OutStr, 3);
end;
else
// Copy the character if the string was misaligned by previous changes
if (CounterDiff <> 0) then OutStr^:=c;
@ -1486,16 +1535,11 @@ begin
SetLength(Result,OutStr - PChar(Result));
end;
function UTF8UpperCase(const AInStr: utf8string): utf8string;
begin
Result := UTF8UpperCase(AInStr, '');
end;
{
AInStr - The input string
ALocale - The locale. Use '' for maximum speed if one desires to ignore the locale
}
function UTF8UpperCase(const AInStr, ALocale: utf8string): utf8string;
function UTF8UpperCase(const AInStr: utf8string; ALocale: utf8string=''): utf8string;
var
i, InCounter, OutCounter: PtrInt;
OutStr: PChar;
@ -1640,7 +1684,7 @@ var
c: Char;
begin
for c:=Low(char) to High(char) do begin
FPUpChars[c]:=upcase(c);
FPUpChars[c]:=(c);
end;
end;

View File

@ -113,11 +113,7 @@ begin
// Turkish
AssertStringOperationUTF8LowerCase('Turkish UTF8LowerCase 1', 'tu', 'abcçdefgğhııijklmnoöprsştuüvyz', 'abcçdefgğhııijklmnoöprsştuüvyz');
AssertStringOperationUTF8LowerCase('Turkish UTF8LowerCase 2', 'tu', 'ABCÇDEFGĞHIIİJKLMNOÖPRSŞTUÜVYZ', 'abcçdefgğhııijklmnoöprsştuüvyz');
AssertStringOperationUTF8LowerCase('Turkish UTF8LowerCase 1', 'tu',
'AhıIxXa',
'ahııxxa');
AssertStringOperationUTF8LowerCase('Turkish UTF8LowerCase 3', 'tu', 'AhıIxXa', 'ahııxxa');
// Cyrillic
AssertStringOperationUTF8LowerCase('Russian UTF8LowerCase 1', '', 'АБВЕЁЖЗКЛМНОПРДЙГ СУФХЦЧШЩЪЫЬЭЮЯИТ', 'абвеёжзклмнопрдйг суфхцчшщъыьэюяит');
AssertStringOperationUTF8LowerCase('Russian UTF8LowerCase 2', '', 'абвеёжзклмнопрдйг суфхцчшщъыьэюяит', 'абвеёжзклмнопрдйг суфхцчшщъыьэюяит');
@ -125,31 +121,33 @@ begin
AssertStringOperationUTF8LowerCase('Cyrillic UTF8UpperCase 2', '', 'Ҋҋ Ҍҍ Ҏҏ Ґґ Ғғ Ҕҕ Җҗ Ҙҙ Ққ Ҝҝ Ҟҟ Ҡҡ Ңң Ҥҥ Ҧҧ Ҩҩ Ҫҫ Ҭҭ Үү Ұұ Ҳҳ Ҵҵ Ҷҷ Ҹҹ Һһ Ҽҽ Ҿҿ', 'ҋҋ ҍҍ ҏҏ ґґ ғғ ҕҕ җҗ ҙҙ ққ ҝҝ ҟҟ ҡҡ ңң ҥҥ ҧҧ ҩҩ ҫҫ ҭҭ үү ұұ ҳҳ ҵҵ ҷҷ ҹҹ һһ ҽҽ ҿҿ');
// What shouldn't change
AssertStringOperationUTF8LowerCase('Chinese UTF8LowerCase 1', '', '名字叫嘉英,嘉陵江的嘉,英國的英', '名字叫嘉英,嘉陵江的嘉,英國的英');
// Georgian
AssertStringOperationUTF8LowerCase('Georgian UTF8LowerCase 1', '', 'Ⴀⴀ Ⴁⴁ Ⴂⴂ Ⴃⴃ Ⴄⴄ Ⴅⴅ Ⴆⴆ Ⴇⴇ Ⴈⴈ Ⴉⴉ Ⴊⴊ Ⴋⴋ Ⴌⴌ Ⴍⴍ Ⴎⴎ Ⴏⴏ Ⴐⴐ Ⴑⴑ', 'ⴀⴀ ⴁⴁ ⴂⴂ ⴃⴃ ⴄⴄ ⴅⴅ ⴆⴆ ⴇⴇ ⴈⴈ ⴉⴉ ⴊⴊ ⴋⴋ ⴌⴌ ⴍⴍ ⴎⴎ ⴏⴏ ⴐⴐ ⴑⴑ');
AssertStringOperationUTF8LowerCase('Georgian UTF8LowerCase 2', '', 'Ⴒⴒ Ⴓⴓ Ⴔⴔ Ⴕⴕ Ⴖⴖ Ⴗⴗ Ⴘⴘ Ⴙⴙ Ⴚⴚ Ⴛⴛ Ⴜⴜ Ⴝⴝ Ⴞⴞ Ⴟⴟ Ⴠⴠ Ⴡⴡ Ⴢⴢ Ⴣⴣ Ⴤⴤ Ⴥⴥ', 'ⴒⴒ ⴓⴓ ⴔⴔ ⴕⴕ ⴖⴖ ⴗⴗ ⴘⴘ ⴙⴙ ⴚⴚ ⴛⴛ ⴜⴜ ⴝⴝ ⴞⴞ ⴟⴟ ⴠⴠ ⴡⴡ ⴢⴢ ⴣⴣ ⴤⴤ ⴥⴥ');
// repeat all tests with leading turkish i, to force offset
// repeat all tests with leading turkish i, to force offset
// ASCII
AssertStringOperationUTF8LowerCase('X ASCII UTF8LowerCase', 'tu', 'IABCDEFGHIJKLMNOPQRSTUWVXYZ', 'ıabcdefghıjklmnopqrstuwvxyz');
AssertStringOperationUTF8LowerCase('Offset ASCII UTF8LowerCase', 'tu', 'IABCDEFGHIJKLMNOPQRSTUWVXYZ', 'ıabcdefghıjklmnopqrstuwvxyz');
// Latin
AssertStringOperationUTF8LowerCase('X Portuguese UTF8LowerCase 1', 'tu', 'IÇ/ç Ã/ã Õ/õ Á/á É/é Í/í Ó/ó Ú/ú Ü/ü À/à Â/â Ê/ê Î/î Ô/ô Û/û', 'ıç/ç ã/ã õ/õ á/á é/é í/í ó/ó ú/ú ü/ü à/à â/â ê/ê î/î ô/ô û/û');
AssertStringOperationUTF8LowerCase('X French UTF8LowerCase 1', 'tu', 'IÀ/à Â/â æ Ç/ç É/é È/è Ê/ê Ë/ë Î/î Ï/ï Ô/ô œ Ù/ù Û/û Ü/ü Ÿ/ÿ', 'ıà/à â/â æ ç/ç é/é è/è ê/ê ë/ë î/î ï/ï ô/ô œ ù/ù û/û ü/ü ÿ/ÿ');
AssertStringOperationUTF8LowerCase('X Polish UTF8LowerCase 1', 'tu', 'Iaąbcćdeęfghijklłmnńoóprsśtuwyzźż', 'ıaąbcćdeęfghijklłmnńoóprsśtuwyzźż');
AssertStringOperationUTF8LowerCase('X Polish UTF8LowerCase 2', 'tu', 'IAĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻ', 'ıaąbcćdeęfghıjklłmnńoóprsśtuwyzźż');
AssertStringOperationUTF8LowerCase('X German UTF8LowerCase 1', 'tu', 'IÄ/ä,Ö/ö,Ü/ü,ß', 'ıä/ä,ö/ö,ü/ü,ß');
AssertStringOperationUTF8LowerCase('Offset Portuguese UTF8LowerCase 1', 'tu', 'IÇ/ç Ã/ã Õ/õ Á/á É/é Í/í Ó/ó Ú/ú Ü/ü À/à Â/â Ê/ê Î/î Ô/ô Û/û', 'ıç/ç ã/ã õ/õ á/á é/é í/í ó/ó ú/ú ü/ü à/à â/â ê/ê î/î ô/ô û/û');
AssertStringOperationUTF8LowerCase('Offset French UTF8LowerCase 1', 'tu', 'IÀ/à Â/â æ Ç/ç É/é È/è Ê/ê Ë/ë Î/î Ï/ï Ô/ô œ Ù/ù Û/û Ü/ü Ÿ/ÿ', 'ıà/à â/â æ ç/ç é/é è/è ê/ê ë/ë î/î ï/ï ô/ô œ ù/ù û/û ü/ü ÿ/ÿ');
AssertStringOperationUTF8LowerCase('Offset Polish UTF8LowerCase 1', 'tu', 'Iaąbcćdeęfghijklłmnńoóprsśtuwyzźż', 'ıaąbcćdeęfghijklłmnńoóprsśtuwyzźż');
AssertStringOperationUTF8LowerCase('Offset Polish UTF8LowerCase 2', 'tu', 'IAĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻ', 'ıaąbcćdeęfghıjklłmnńoóprsśtuwyzźż');
AssertStringOperationUTF8LowerCase('Offset German UTF8LowerCase 1', 'tu', 'IÄ/ä,Ö/ö,Ü/ü,ß', 'ıä/ä,ö/ö,ü/ü,ß');
// Turkish
AssertStringOperationUTF8LowerCase('X Turkish UTF8LowerCase 1', 'tu', 'Iabcçdefgğhııijklmnoöprsştuüvyz', 'ıabcçdefgğhııijklmnoöprsştuüvyz');
AssertStringOperationUTF8LowerCase('X Turkish UTF8LowerCase 2', 'tu', 'IABCÇDEFGĞHIIİJKLMNOÖPRSŞTUÜVYZ', 'ıabcçdefgğhııijklmnoöprsştuüvyz');
AssertStringOperationUTF8LowerCase('X Turkish UTF8LowerCase 1', 'tu',
'IAhıIxXa',
'ıahııxxa');
AssertStringOperationUTF8LowerCase('Offset Turkish UTF8LowerCase 1', 'tu', 'Iabcçdefgğhııijklmnoöprsştuüvyz', 'ıabcçdefgğhııijklmnoöprsştuüvyz');
AssertStringOperationUTF8LowerCase('Offset Turkish UTF8LowerCase 2', 'tu', 'IABCÇDEFGĞHIIİJKLMNOÖPRSŞTUÜVYZ', 'ıabcçdefgğhııijklmnoöprsştuüvyz');
AssertStringOperationUTF8LowerCase('Offset Turkish UTF8LowerCase 1', 'tu', 'IAhıIxXa', 'ıahııxxa');
// Cyrillic
AssertStringOperationUTF8LowerCase('X Russian UTF8LowerCase 1', 'tu', 'IАБВЕЁЖЗКЛМНОПРДЙГ СУФХЦЧШЩЪЫЬЭЮЯИТ', 'ıабвеёжзклмнопрдйг суфхцчшщъыьэюяит');
AssertStringOperationUTF8LowerCase('X Russian UTF8LowerCase 2', 'tu', 'Iабвеёжзклмнопрдйг суфхцчшщъыьэюяит', 'ıабвеёжзклмнопрдйг суфхцчшщъыьэюяит');
AssertStringOperationUTF8LowerCase('X Cyrillic UTF8UpperCase 1', 'tu', 'IѠѡ Ѣѣ Ѥѥ Ѧѧ Ѩѩ Ѫѫ Ѭѭ Ѯѯ Ѱѱ Ѳѳ Ѵѵ Ѷѷ Ѹѹ Ѻѻ Ѽѽ Ѿѿ Ҁҁ', 'ıѡѡ ѣѣ ѥѥ ѧѧ ѩѩ ѫѫ ѭѭ ѯѯ ѱѱ ѳѳ ѵѵ ѷѷ ѹѹ ѻѻ ѽѽ ѿѿ ҁҁ');
AssertStringOperationUTF8LowerCase('X Cyrillic UTF8UpperCase 2', 'tu', 'IҊҋ Ҍҍ Ҏҏ Ґґ Ғғ Ҕҕ Җҗ Ҙҙ Ққ Ҝҝ Ҟҟ Ҡҡ Ңң Ҥҥ Ҧҧ Ҩҩ Ҫҫ Ҭҭ Үү Ұұ Ҳҳ Ҵҵ Ҷҷ Ҹҹ Һһ Ҽҽ Ҿҿ', 'ıҋҋ ҍҍ ҏҏ ґґ ғғ ҕҕ җҗ ҙҙ ққ ҝҝ ҟҟ ҡҡ ңң ҥҥ ҧҧ ҩҩ ҫҫ ҭҭ үү ұұ ҳҳ ҵҵ ҷҷ ҹҹ һһ ҽҽ ҿҿ');
AssertStringOperationUTF8LowerCase('Offset Russian UTF8LowerCase 1', 'tu', 'IАБВЕЁЖЗКЛМНОПРДЙГ СУФХЦЧШЩЪЫЬЭЮЯИТ', 'ıабвеёжзклмнопрдйг суфхцчшщъыьэюяит');
AssertStringOperationUTF8LowerCase('Offset Russian UTF8LowerCase 2', 'tu', 'Iабвеёжзклмнопрдйг суфхцчшщъыьэюяит', 'ıабвеёжзклмнопрдйг суфхцчшщъыьэюяит');
AssertStringOperationUTF8LowerCase('Offset Cyrillic UTF8UpperCase 1', 'tu', 'IѠѡ Ѣѣ Ѥѥ Ѧѧ Ѩѩ Ѫѫ Ѭѭ Ѯѯ Ѱѱ Ѳѳ Ѵѵ Ѷѷ Ѹѹ Ѻѻ Ѽѽ Ѿѿ Ҁҁ', 'ıѡѡ ѣѣ ѥѥ ѧѧ ѩѩ ѫѫ ѭѭ ѯѯ ѱѱ ѳѳ ѵѵ ѷѷ ѹѹ ѻѻ ѽѽ ѿѿ ҁҁ');
AssertStringOperationUTF8LowerCase('Offset Cyrillic UTF8UpperCase 2', 'tu', 'IҊҋ Ҍҍ Ҏҏ Ґґ Ғғ Ҕҕ Җҗ Ҙҙ Ққ Ҝҝ Ҟҟ Ҡҡ Ңң Ҥҥ Ҧҧ Ҩҩ Ҫҫ Ҭҭ Үү Ұұ Ҳҳ Ҵҵ Ҷҷ Ҹҹ Һһ Ҽҽ Ҿҿ', 'ıҋҋ ҍҍ ҏҏ ґґ ғғ ҕҕ җҗ ҙҙ ққ ҝҝ ҟҟ ҡҡ ңң ҥҥ ҧҧ ҩҩ ҫҫ ҭҭ үү ұұ ҳҳ ҵҵ ҷҷ ҹҹ һһ ҽҽ ҿҿ');
// What shouldn't change
AssertStringOperationUTF8LowerCase('X Chinese UTF8LowerCase 1', 'tu', 'I名字叫嘉英嘉陵江的嘉英國的英', 'ı名字叫嘉英,嘉陵江的嘉,英國的英');
AssertStringOperationUTF8LowerCase('Offset Chinese UTF8LowerCase 1', 'tu', 'I名字叫嘉英嘉陵江的嘉英國的英', 'ı名字叫嘉英,嘉陵江的嘉,英國的英');
// Georgian
AssertStringOperationUTF8LowerCase('Offset Georgian UTF8LowerCase 1', 'tu', 'IႠⴀ Ⴁⴁ Ⴂⴂ Ⴃⴃ Ⴄⴄ Ⴅⴅ Ⴆⴆ Ⴇⴇ Ⴈⴈ Ⴉⴉ Ⴊⴊ Ⴋⴋ Ⴌⴌ Ⴍⴍ Ⴎⴎ Ⴏⴏ Ⴐⴐ Ⴑⴑ', 'ⴀⴀ ⴁⴁ ⴂⴂ ⴃⴃ ⴄⴄ ⴅⴅ ⴆⴆ ⴇⴇ ⴈⴈ ⴉⴉ ⴊⴊ ⴋⴋ ⴌⴌ ⴍⴍ ⴎⴎ ⴏⴏ ⴐⴐ ⴑⴑ');
AssertStringOperationUTF8LowerCase('Offset Georgian UTF8LowerCase 2', 'tu', 'IႲⴒ Ⴓⴓ Ⴔⴔ Ⴕⴕ Ⴖⴖ Ⴗⴗ Ⴘⴘ Ⴙⴙ Ⴚⴚ Ⴛⴛ Ⴜⴜ Ⴝⴝ Ⴞⴞ Ⴟⴟ Ⴠⴠ Ⴡⴡ Ⴢⴢ Ⴣⴣ Ⴤⴤ Ⴥⴥ', 'ⴒⴒ ⴓⴓ ⴔⴔ ⴕⴕ ⴖⴖ ⴗⴗ ⴘⴘ ⴙⴙ ⴚⴚ ⴛⴛ ⴜⴜ ⴝⴝ ⴞⴞ ⴟⴟ ⴠⴠ ⴡⴡ ⴢⴢ ⴣⴣ ⴤⴤ ⴥⴥ');
// Performance test
Write('Mattias LowerCase- Performance test took: ');