mirror of
https://gitlab.com/freepascal.org/lazarus/lazarus.git
synced 2025-12-16 01:00:34 +01:00
Lowercase: Adds Georgian support
git-svn-id: trunk@32854 -
This commit is contained in:
parent
a617d46d26
commit
5d79eb6782
@ -68,8 +68,7 @@ function UnicodeLowercase(u: cardinal): cardinal;
|
||||
function UTF8LowerCaseMattias(const s: utf8string): utf8string;
|
||||
{$endif}
|
||||
function UTF8LowerCase(const AInStr: utf8string; ALocale: utf8string=''): utf8string;
|
||||
function UTF8UpperCase(const AInStr: utf8string): utf8string;
|
||||
function UTF8UpperCase(const AInStr, ALocale: utf8string): utf8string;
|
||||
function UTF8UpperCase(const AInStr: utf8string; ALocale: utf8string=''): utf8string;
|
||||
{function FindInvalidUTF8Character(p: PChar; Count: PtrInt;
|
||||
// StopOnNonASCII: Boolean = false): PtrInt;
|
||||
//function ValidUTF8String(const s: String): String;
|
||||
@ -1168,6 +1167,24 @@ end;
|
||||
{
|
||||
AInStr - The input string
|
||||
ALocale - The locale. Use '' for maximum speed if one desires to ignore the locale
|
||||
|
||||
Data from here: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
|
||||
|
||||
List of ranges which have lowercase:
|
||||
$0041..$005A ASCII
|
||||
$00C0..$00DE: Result:=UnicodeLower00C0_00DE[u];
|
||||
$0100..$024E: Result:=UnicodeLower0100_024E[u];
|
||||
$0386..$03AB: Result:=UnicodeLower0386_03AB[u];
|
||||
$03D8..$042F: Result:=UnicodeLower03D8_042F[u];
|
||||
$0460..$0512: Result:=UnicodeLower0460_0512[u];
|
||||
$0531..$0556: Result:=u+48;
|
||||
$10A0..$10C5 Georgian
|
||||
$1E00..$1FFC: Result:=UnicodeLower1E00_1FFC[u];
|
||||
$2126..$2183: Result:=UnicodeLower2126_2183[u];
|
||||
$24B6..$24CF: Result:=u+26;
|
||||
$2C00..$2C2E: Result:=u+48;
|
||||
$2C60..$2CE2: Result:=UnicodeLower2C60_2CE2[u];
|
||||
$FF21..$FF3A: Result:=u+32;
|
||||
}
|
||||
function UTF8LowerCase(const AInStr: utf8string; ALocale: utf8string=''): utf8string;
|
||||
var
|
||||
@ -1175,7 +1192,7 @@ var
|
||||
InStr, InStrEnd, OutStr: PChar;
|
||||
// Language identification
|
||||
IsTurkish: Boolean;
|
||||
c: Char;
|
||||
c, c2: Char;
|
||||
begin
|
||||
Result:=AInStr;
|
||||
InStr := PChar(AInStr);
|
||||
@ -1187,7 +1204,7 @@ begin
|
||||
begin
|
||||
c := InStr^;
|
||||
case c of
|
||||
'A'..'Z',#$C3, #$C4, #$C5..#$C8, #$CE, #$D0..#$D2: Break;
|
||||
'A'..'Z',#$C3, #$C4, #$C5..#$C8, #$CE, #$D0..#$D2,#$E1: Break;
|
||||
// already lowercase, or otherwise not affected
|
||||
else
|
||||
inc(InStr);
|
||||
@ -1474,6 +1491,38 @@ begin
|
||||
inc(InStr, 2);
|
||||
inc(OutStr, 2);
|
||||
end;
|
||||
// Georgian codepoints 10A0-10C5 => 2D00-2D25
|
||||
// In UTF-8 this is:
|
||||
// E1 82 A0 - E1 82 BF => E2 B4 80 - E2 B4 9F
|
||||
// E1 83 80 - E1 83 85 => E2 B4 A0 - E2 B4 A5
|
||||
#$E1:
|
||||
begin
|
||||
c := InStr[1];
|
||||
c2 := InStr[2];
|
||||
if (c = #$82) and (c2 in [#$A0..#$BF]) then
|
||||
begin
|
||||
OutStr^ := #$E2;
|
||||
OutStr[1] := #$B4;
|
||||
OutStr[2] := chr(ord(c2) - $20);
|
||||
end
|
||||
else if (c = #$83) and (c2 in [#$80..#$85]) then
|
||||
begin
|
||||
OutStr^ := #$E2;
|
||||
OutStr[1] := #$B4;
|
||||
OutStr[2] := chr(ord(c2) + $20);
|
||||
end
|
||||
else
|
||||
begin
|
||||
if (CounterDiff <> 0) then
|
||||
begin
|
||||
OutStr^ := InStr[0];
|
||||
OutStr[1] := InStr[1];
|
||||
OutStr[2] := InStr[2];
|
||||
end;
|
||||
end;
|
||||
inc(InStr, 3);
|
||||
inc(OutStr, 3);
|
||||
end;
|
||||
else
|
||||
// Copy the character if the string was disaligned by previous changes
|
||||
if (CounterDiff <> 0) then OutStr^:=c;
|
||||
@ -1486,16 +1535,11 @@ begin
|
||||
SetLength(Result,OutStr - PChar(Result));
|
||||
end;
|
||||
|
||||
function UTF8UpperCase(const AInStr: utf8string): utf8string;
|
||||
begin
|
||||
Result := UTF8UpperCase(AInStr, '');
|
||||
end;
|
||||
|
||||
{
|
||||
AInStr - The input string
|
||||
ALocale - The locale. Use '' for maximum speed if one desires to ignore the locale
|
||||
}
|
||||
function UTF8UpperCase(const AInStr, ALocale: utf8string): utf8string;
|
||||
function UTF8UpperCase(const AInStr: utf8string; ALocale: utf8string=''): utf8string;
|
||||
var
|
||||
i, InCounter, OutCounter: PtrInt;
|
||||
OutStr: PChar;
|
||||
@ -1640,7 +1684,7 @@ var
|
||||
c: Char;
|
||||
begin
|
||||
for c:=Low(char) to High(char) do begin
|
||||
FPUpChars[c]:=upcase(c);
|
||||
FPUpChars[c]:=(c);
|
||||
end;
|
||||
end;
|
||||
|
||||
|
||||
@ -113,11 +113,7 @@ begin
|
||||
// Turkish
|
||||
AssertStringOperationUTF8LowerCase('Turkish UTF8LowerCase 1', 'tu', 'abcçdefgğhııijklmnoöprsştuüvyz', 'abcçdefgğhııijklmnoöprsştuüvyz');
|
||||
AssertStringOperationUTF8LowerCase('Turkish UTF8LowerCase 2', 'tu', 'ABCÇDEFGĞHIIİJKLMNOÖPRSŞTUÜVYZ', 'abcçdefgğhııijklmnoöprsştuüvyz');
|
||||
|
||||
AssertStringOperationUTF8LowerCase('Turkish UTF8LowerCase 1', 'tu',
|
||||
'AhıIxXa',
|
||||
'ahııxxa');
|
||||
|
||||
AssertStringOperationUTF8LowerCase('Turkish UTF8LowerCase 3', 'tu', 'AhıIxXa', 'ahııxxa');
|
||||
// Cyrillic
|
||||
AssertStringOperationUTF8LowerCase('Russian UTF8LowerCase 1', '', 'АБВЕЁЖЗКЛМНОПРДЙГ СУФХЦЧШЩЪЫЬЭЮЯИТ', 'абвеёжзклмнопрдйг суфхцчшщъыьэюяит');
|
||||
AssertStringOperationUTF8LowerCase('Russian UTF8LowerCase 2', '', 'абвеёжзклмнопрдйг суфхцчшщъыьэюяит', 'абвеёжзклмнопрдйг суфхцчшщъыьэюяит');
|
||||
@ -125,31 +121,33 @@ begin
|
||||
AssertStringOperationUTF8LowerCase('Cyrillic UTF8UpperCase 2', '', 'Ҋҋ Ҍҍ Ҏҏ Ґґ Ғғ Ҕҕ Җҗ Ҙҙ Ққ Ҝҝ Ҟҟ Ҡҡ Ңң Ҥҥ Ҧҧ Ҩҩ Ҫҫ Ҭҭ Үү Ұұ Ҳҳ Ҵҵ Ҷҷ Ҹҹ Һһ Ҽҽ Ҿҿ', 'ҋҋ ҍҍ ҏҏ ґґ ғғ ҕҕ җҗ ҙҙ ққ ҝҝ ҟҟ ҡҡ ңң ҥҥ ҧҧ ҩҩ ҫҫ ҭҭ үү ұұ ҳҳ ҵҵ ҷҷ ҹҹ һһ ҽҽ ҿҿ');
|
||||
// What shouldn't change
|
||||
AssertStringOperationUTF8LowerCase('Chinese UTF8LowerCase 1', '', '名字叫嘉英,嘉陵江的嘉,英國的英', '名字叫嘉英,嘉陵江的嘉,英國的英');
|
||||
// Georgian
|
||||
AssertStringOperationUTF8LowerCase('Georgian UTF8LowerCase 1', '', 'Ⴀⴀ Ⴁⴁ Ⴂⴂ Ⴃⴃ Ⴄⴄ Ⴅⴅ Ⴆⴆ Ⴇⴇ Ⴈⴈ Ⴉⴉ Ⴊⴊ Ⴋⴋ Ⴌⴌ Ⴍⴍ Ⴎⴎ Ⴏⴏ Ⴐⴐ Ⴑⴑ', 'ⴀⴀ ⴁⴁ ⴂⴂ ⴃⴃ ⴄⴄ ⴅⴅ ⴆⴆ ⴇⴇ ⴈⴈ ⴉⴉ ⴊⴊ ⴋⴋ ⴌⴌ ⴍⴍ ⴎⴎ ⴏⴏ ⴐⴐ ⴑⴑ');
|
||||
AssertStringOperationUTF8LowerCase('Georgian UTF8LowerCase 2', '', 'Ⴒⴒ Ⴓⴓ Ⴔⴔ Ⴕⴕ Ⴖⴖ Ⴗⴗ Ⴘⴘ Ⴙⴙ Ⴚⴚ Ⴛⴛ Ⴜⴜ Ⴝⴝ Ⴞⴞ Ⴟⴟ Ⴠⴠ Ⴡⴡ Ⴢⴢ Ⴣⴣ Ⴤⴤ Ⴥⴥ', 'ⴒⴒ ⴓⴓ ⴔⴔ ⴕⴕ ⴖⴖ ⴗⴗ ⴘⴘ ⴙⴙ ⴚⴚ ⴛⴛ ⴜⴜ ⴝⴝ ⴞⴞ ⴟⴟ ⴠⴠ ⴡⴡ ⴢⴢ ⴣⴣ ⴤⴤ ⴥⴥ');
|
||||
|
||||
// repeat all tests with leading turkish i, to force offset
|
||||
// repeat all tests with leading turkish i, to force offset
|
||||
// ASCII
|
||||
AssertStringOperationUTF8LowerCase('X ASCII UTF8LowerCase', 'tu', 'IABCDEFGHIJKLMNOPQRSTUWVXYZ', 'ıabcdefghıjklmnopqrstuwvxyz');
|
||||
AssertStringOperationUTF8LowerCase('Offset ASCII UTF8LowerCase', 'tu', 'IABCDEFGHIJKLMNOPQRSTUWVXYZ', 'ıabcdefghıjklmnopqrstuwvxyz');
|
||||
// Latin
|
||||
AssertStringOperationUTF8LowerCase('X Portuguese UTF8LowerCase 1', 'tu', 'IÇ/ç Ã/ã Õ/õ Á/á É/é Í/í Ó/ó Ú/ú Ü/ü À/à Â/â Ê/ê Î/î Ô/ô Û/û', 'ıç/ç ã/ã õ/õ á/á é/é í/í ó/ó ú/ú ü/ü à/à â/â ê/ê î/î ô/ô û/û');
|
||||
AssertStringOperationUTF8LowerCase('X French UTF8LowerCase 1', 'tu', 'IÀ/à Â/â æ Ç/ç É/é È/è Ê/ê Ë/ë Î/î Ï/ï Ô/ô œ Ù/ù Û/û Ü/ü Ÿ/ÿ', 'ıà/à â/â æ ç/ç é/é è/è ê/ê ë/ë î/î ï/ï ô/ô œ ù/ù û/û ü/ü ÿ/ÿ');
|
||||
AssertStringOperationUTF8LowerCase('X Polish UTF8LowerCase 1', 'tu', 'Iaąbcćdeęfghijklłmnńoóprsśtuwyzźż', 'ıaąbcćdeęfghijklłmnńoóprsśtuwyzźż');
|
||||
AssertStringOperationUTF8LowerCase('X Polish UTF8LowerCase 2', 'tu', 'IAĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻ', 'ıaąbcćdeęfghıjklłmnńoóprsśtuwyzźż');
|
||||
AssertStringOperationUTF8LowerCase('X German UTF8LowerCase 1', 'tu', 'IÄ/ä,Ö/ö,Ü/ü,ß', 'ıä/ä,ö/ö,ü/ü,ß');
|
||||
AssertStringOperationUTF8LowerCase('Offset Portuguese UTF8LowerCase 1', 'tu', 'IÇ/ç Ã/ã Õ/õ Á/á É/é Í/í Ó/ó Ú/ú Ü/ü À/à Â/â Ê/ê Î/î Ô/ô Û/û', 'ıç/ç ã/ã õ/õ á/á é/é í/í ó/ó ú/ú ü/ü à/à â/â ê/ê î/î ô/ô û/û');
|
||||
AssertStringOperationUTF8LowerCase('Offset French UTF8LowerCase 1', 'tu', 'IÀ/à Â/â æ Ç/ç É/é È/è Ê/ê Ë/ë Î/î Ï/ï Ô/ô œ Ù/ù Û/û Ü/ü Ÿ/ÿ', 'ıà/à â/â æ ç/ç é/é è/è ê/ê ë/ë î/î ï/ï ô/ô œ ù/ù û/û ü/ü ÿ/ÿ');
|
||||
AssertStringOperationUTF8LowerCase('Offset Polish UTF8LowerCase 1', 'tu', 'Iaąbcćdeęfghijklłmnńoóprsśtuwyzźż', 'ıaąbcćdeęfghijklłmnńoóprsśtuwyzźż');
|
||||
AssertStringOperationUTF8LowerCase('Offset Polish UTF8LowerCase 2', 'tu', 'IAĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻ', 'ıaąbcćdeęfghıjklłmnńoóprsśtuwyzźż');
|
||||
AssertStringOperationUTF8LowerCase('Offset German UTF8LowerCase 1', 'tu', 'IÄ/ä,Ö/ö,Ü/ü,ß', 'ıä/ä,ö/ö,ü/ü,ß');
|
||||
// Turkish
|
||||
AssertStringOperationUTF8LowerCase('X Turkish UTF8LowerCase 1', 'tu', 'Iabcçdefgğhııijklmnoöprsştuüvyz', 'ıabcçdefgğhııijklmnoöprsştuüvyz');
|
||||
AssertStringOperationUTF8LowerCase('X Turkish UTF8LowerCase 2', 'tu', 'IABCÇDEFGĞHIIİJKLMNOÖPRSŞTUÜVYZ', 'ıabcçdefgğhııijklmnoöprsştuüvyz');
|
||||
|
||||
AssertStringOperationUTF8LowerCase('X Turkish UTF8LowerCase 1', 'tu',
|
||||
'IAhıIxXa',
|
||||
'ıahııxxa');
|
||||
|
||||
AssertStringOperationUTF8LowerCase('Offset Turkish UTF8LowerCase 1', 'tu', 'Iabcçdefgğhııijklmnoöprsştuüvyz', 'ıabcçdefgğhııijklmnoöprsştuüvyz');
|
||||
AssertStringOperationUTF8LowerCase('Offset Turkish UTF8LowerCase 2', 'tu', 'IABCÇDEFGĞHIIİJKLMNOÖPRSŞTUÜVYZ', 'ıabcçdefgğhııijklmnoöprsştuüvyz');
|
||||
AssertStringOperationUTF8LowerCase('Offset Turkish UTF8LowerCase 1', 'tu', 'IAhıIxXa', 'ıahııxxa');
|
||||
// Cyrillic
|
||||
AssertStringOperationUTF8LowerCase('X Russian UTF8LowerCase 1', 'tu', 'IАБВЕЁЖЗКЛМНОПРДЙГ СУФХЦЧШЩЪЫЬЭЮЯИТ', 'ıабвеёжзклмнопрдйг суфхцчшщъыьэюяит');
|
||||
AssertStringOperationUTF8LowerCase('X Russian UTF8LowerCase 2', 'tu', 'Iабвеёжзклмнопрдйг суфхцчшщъыьэюяит', 'ıабвеёжзклмнопрдйг суфхцчшщъыьэюяит');
|
||||
AssertStringOperationUTF8LowerCase('X Cyrillic UTF8UpperCase 1', 'tu', 'IѠѡ Ѣѣ Ѥѥ Ѧѧ Ѩѩ Ѫѫ Ѭѭ Ѯѯ Ѱѱ Ѳѳ Ѵѵ Ѷѷ Ѹѹ Ѻѻ Ѽѽ Ѿѿ Ҁҁ', 'ıѡѡ ѣѣ ѥѥ ѧѧ ѩѩ ѫѫ ѭѭ ѯѯ ѱѱ ѳѳ ѵѵ ѷѷ ѹѹ ѻѻ ѽѽ ѿѿ ҁҁ');
|
||||
AssertStringOperationUTF8LowerCase('X Cyrillic UTF8UpperCase 2', 'tu', 'IҊҋ Ҍҍ Ҏҏ Ґґ Ғғ Ҕҕ Җҗ Ҙҙ Ққ Ҝҝ Ҟҟ Ҡҡ Ңң Ҥҥ Ҧҧ Ҩҩ Ҫҫ Ҭҭ Үү Ұұ Ҳҳ Ҵҵ Ҷҷ Ҹҹ Һһ Ҽҽ Ҿҿ', 'ıҋҋ ҍҍ ҏҏ ґґ ғғ ҕҕ җҗ ҙҙ ққ ҝҝ ҟҟ ҡҡ ңң ҥҥ ҧҧ ҩҩ ҫҫ ҭҭ үү ұұ ҳҳ ҵҵ ҷҷ ҹҹ һһ ҽҽ ҿҿ');
|
||||
AssertStringOperationUTF8LowerCase('Offset Russian UTF8LowerCase 1', 'tu', 'IАБВЕЁЖЗКЛМНОПРДЙГ СУФХЦЧШЩЪЫЬЭЮЯИТ', 'ıабвеёжзклмнопрдйг суфхцчшщъыьэюяит');
|
||||
AssertStringOperationUTF8LowerCase('Offset Russian UTF8LowerCase 2', 'tu', 'Iабвеёжзклмнопрдйг суфхцчшщъыьэюяит', 'ıабвеёжзклмнопрдйг суфхцчшщъыьэюяит');
|
||||
AssertStringOperationUTF8LowerCase('Offset Cyrillic UTF8UpperCase 1', 'tu', 'IѠѡ Ѣѣ Ѥѥ Ѧѧ Ѩѩ Ѫѫ Ѭѭ Ѯѯ Ѱѱ Ѳѳ Ѵѵ Ѷѷ Ѹѹ Ѻѻ Ѽѽ Ѿѿ Ҁҁ', 'ıѡѡ ѣѣ ѥѥ ѧѧ ѩѩ ѫѫ ѭѭ ѯѯ ѱѱ ѳѳ ѵѵ ѷѷ ѹѹ ѻѻ ѽѽ ѿѿ ҁҁ');
|
||||
AssertStringOperationUTF8LowerCase('Offset Cyrillic UTF8UpperCase 2', 'tu', 'IҊҋ Ҍҍ Ҏҏ Ґґ Ғғ Ҕҕ Җҗ Ҙҙ Ққ Ҝҝ Ҟҟ Ҡҡ Ңң Ҥҥ Ҧҧ Ҩҩ Ҫҫ Ҭҭ Үү Ұұ Ҳҳ Ҵҵ Ҷҷ Ҹҹ Һһ Ҽҽ Ҿҿ', 'ıҋҋ ҍҍ ҏҏ ґґ ғғ ҕҕ җҗ ҙҙ ққ ҝҝ ҟҟ ҡҡ ңң ҥҥ ҧҧ ҩҩ ҫҫ ҭҭ үү ұұ ҳҳ ҵҵ ҷҷ ҹҹ һһ ҽҽ ҿҿ');
|
||||
// What shouldn't change
|
||||
AssertStringOperationUTF8LowerCase('X Chinese UTF8LowerCase 1', 'tu', 'I名字叫嘉英,嘉陵江的嘉,英國的英', 'ı名字叫嘉英,嘉陵江的嘉,英國的英');
|
||||
AssertStringOperationUTF8LowerCase('Offset Chinese UTF8LowerCase 1', 'tu', 'I名字叫嘉英,嘉陵江的嘉,英國的英', 'ı名字叫嘉英,嘉陵江的嘉,英國的英');
|
||||
// Georgian
|
||||
AssertStringOperationUTF8LowerCase('Offset Georgian UTF8LowerCase 1', 'tu', 'IႠⴀ Ⴁⴁ Ⴂⴂ Ⴃⴃ Ⴄⴄ Ⴅⴅ Ⴆⴆ Ⴇⴇ Ⴈⴈ Ⴉⴉ Ⴊⴊ Ⴋⴋ Ⴌⴌ Ⴍⴍ Ⴎⴎ Ⴏⴏ Ⴐⴐ Ⴑⴑ', 'ⴀⴀ ⴁⴁ ⴂⴂ ⴃⴃ ⴄⴄ ⴅⴅ ⴆⴆ ⴇⴇ ⴈⴈ ⴉⴉ ⴊⴊ ⴋⴋ ⴌⴌ ⴍⴍ ⴎⴎ ⴏⴏ ⴐⴐ ⴑⴑ');
|
||||
AssertStringOperationUTF8LowerCase('Offset Georgian UTF8LowerCase 2', 'tu', 'IႲⴒ Ⴓⴓ Ⴔⴔ Ⴕⴕ Ⴖⴖ Ⴗⴗ Ⴘⴘ Ⴙⴙ Ⴚⴚ Ⴛⴛ Ⴜⴜ Ⴝⴝ Ⴞⴞ Ⴟⴟ Ⴠⴠ Ⴡⴡ Ⴢⴢ Ⴣⴣ Ⴤⴤ Ⴥⴥ', 'ⴒⴒ ⴓⴓ ⴔⴔ ⴕⴕ ⴖⴖ ⴗⴗ ⴘⴘ ⴙⴙ ⴚⴚ ⴛⴛ ⴜⴜ ⴝⴝ ⴞⴞ ⴟⴟ ⴠⴠ ⴡⴡ ⴢⴢ ⴣⴣ ⴤⴤ ⴥⴥ');
|
||||
|
||||
// Performance test
|
||||
Write('Mattias LowerCase- Performance test took: ');
|
||||
|
||||
Loading…
Reference in New Issue
Block a user