{ /*************************************************************************** lazutf8.pas ***************************************************************************/ ***************************************************************************** * * * This file is part of LazUtils * * * * See the file COPYING.modifiedLGPL.txt, included in this distribution, * * for details about the copyright. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * * * ***************************************************************************** Useful routines for managing UTF-8 strings } unit LazUTF8; {$mode objfpc}{$H+} interface uses {$ifdef windows} Windows, {$endif} Classes, SysUtils; // AnsiToUTF8 and UTF8ToAnsi need a widestring manager under Linux, BSD, MacOSX // but normally these OS use UTF-8 as system encoding so the widestringmanager // is not needed. function NeedRTLAnsi: boolean;// true if system encoding is not UTF-8 procedure SetNeedRTLAnsi(NewValue: boolean); function UTF8ToSys(const s: string): string;// as UTF8ToAnsi but more independent of widestringmanager function SysToUTF8(const s: string): string;// as AnsiToUTF8 but more independent of widestringmanager function UTF8CharacterLength(p: PChar): integer; function UTF8Length(const s: string): PtrInt; function UTF8Length(p: PChar; ByteCount: PtrInt): PtrInt; function UTF8CharacterToUnicode(p: PChar; out CharLen: integer): Cardinal; function UnicodeToUTF8(CodePoint: cardinal): string; function UnicodeToUTF8(CodePoint: cardinal; Buf: PChar): integer; function UnicodeToUTF8SkipErrors(CodePoint: cardinal; Buf: PChar): integer; function UnicodeToUTF8Inline(CodePoint: cardinal; Buf: PChar): integer; inline; function UTF8ToDoubleByteString(const s: string): string; function UTF8ToDoubleByte(UTF8Str: PChar; Len: PtrInt; DBStr: PByte): PtrInt; function UTF8FindNearestCharStart(UTF8Str: PChar; Len: integer; BytePos: integer): integer; // find the n-th UTF8 character, ignoring BIDI function UTF8CharStart(UTF8Str: PChar; Len, CharIndex: PtrInt): PChar; // find the byte index of the n-th UTF8 character, ignoring BIDI (byte len of substr) function UTF8CharToByteIndex(UTF8Str: PChar; Len, CharIndex: PtrInt): PtrInt; procedure UTF8FixBroken(P: PChar); overload; procedure UTF8FixBroken(var S: string); overload; function UTF8CharacterStrictLength(P: PChar): integer; function UTF8CStringToUTF8String(SourceStart: PChar; SourceLen: PtrInt) : string; function UTF8Pos(const SearchForText, SearchInText: string; StartPos: SizeInt = 1): PtrInt; function UTF8PosP(SearchForText: PChar; SearchForTextLen: SizeInt; SearchInText: PChar; SearchInTextLen: SizeInt): PChar; function UTF8Copy(const s: string; StartCharIndex, CharCount: PtrInt): string; procedure UTF8Delete(var s: String; StartCharIndex, CharCount: PtrInt); procedure UTF8Insert(const source: String; var s: string; StartCharIndex: PtrInt); function UTF8LowerCase(const AInStr: string; ALanguage: string=''): string; function UTF8UpperCase(const AInStr: string; ALanguage: string=''): string; function FindInvalidUTF8Character(p: PChar; Count: PtrInt; StopOnNonASCII: Boolean = false): PtrInt; function ValidUTF8String(const s: String): String; type TUTF8TrimFlag = ( u8tKeepStart, u8tKeepEnd, u8tKeepTabs, u8tKeepLineBreaks, u8tKeepNoBreakSpaces, u8tKeepControlCodes // excluding tabs and line breaks ); TUTF8TrimFlags = set of TUTF8TrimFlag; function UTF8Trim(const s: string; Flags: TUTF8TrimFlags = []): string; procedure AssignUTF8ListToAnsi(UTF8List, AnsiList: TStrings); //compare functions function UTF8CompareStr(const S1, S2: string): Integer; inline; function UTF8CompareText(const S1, S2: string): Integer; function CompareStrListUTF8LowerCase(List: TStringList; Index1, Index2: Integer): Integer; type TConvertResult = (trNoError, trNullSrc, trNullDest, trDestExhausted, trInvalidChar, trUnfinishedChar); TConvertOption = (toInvalidCharError, toInvalidCharToSymbol, toUnfinishedCharError, toUnfinishedCharToSymbol); TConvertOptions = set of TConvertOption; function ConvertUTF8ToUTF16(Dest: PWideChar; DestWideCharCount: SizeUInt; Src: PChar; SrcCharCount: SizeUInt; Options: TConvertOptions; out ActualWideCharCount: SizeUInt): TConvertResult; function ConvertUTF16ToUTF8(Dest: PChar; DestCharCount: SizeUInt; Src: PWideChar; SrcWideCharCount: SizeUInt; Options: TConvertOptions; out ActualCharCount: SizeUInt): TConvertResult; function UTF8ToUTF16(const S: AnsiString): UnicodeString; function UTF16ToUTF8(const S: UnicodeString): AnsiString; // locale procedure LazGetLanguageIDs(var Lang, FallbackLang: String); procedure LazGetShortLanguageID(var Lang: String); var FPUpChars: array[char] of char; implementation uses gettext {$IFDEF Darwin}, MacOSAll{$ENDIF} ; var FNeedRTLAnsi: boolean = false; FNeedRTLAnsiValid: boolean = false; function NeedRTLAnsi: boolean; {$IFDEF WinCE} // CP_UTF8 is missing in the windows unit of the Windows CE RTL const CP_UTF8 = 65001; {$ENDIF} {$IFNDEF Windows} var Lang: String; i: LongInt; Encoding: String; {$ENDIF} begin if FNeedRTLAnsiValid then exit(FNeedRTLAnsi); {$IFDEF Windows} FNeedRTLAnsi:=GetACP<>CP_UTF8; {$ELSE} FNeedRTLAnsi:=false; Lang := SysUtils.GetEnvironmentVariable('LC_ALL'); if lang = '' then begin Lang := SysUtils.GetEnvironmentVariable('LC_MESSAGES'); if Lang = '' then begin Lang := SysUtils.GetEnvironmentVariable('LANG'); end; end; i:=System.Pos('.',Lang); if (i>0) then begin Encoding:=copy(Lang,i+1,length(Lang)-i); FNeedRTLAnsi:=(SysUtils.CompareText(Encoding,'UTF-8')<>0) and (SysUtils.CompareText(Encoding,'UTF8')<>0); end; {$ENDIF} FNeedRTLAnsiValid:=true; Result:=FNeedRTLAnsi; end; procedure SetNeedRTLAnsi(NewValue: boolean); begin FNeedRTLAnsi:=NewValue; FNeedRTLAnsiValid:=true; end; function IsASCII(const s: string): boolean; inline; var i: Integer; begin for i:=1 to length(s) do if ord(s[i])>127 then exit(false); Result:=true; end; function UTF8ToSys(const s: string): string; begin if NeedRTLAnsi and (not IsASCII(s)) then Result:=UTF8ToAnsi(s) else Result:=s; end; function SysToUTF8(const s: string): string; begin if NeedRTLAnsi and (not IsASCII(s)) then begin Result:=AnsiToUTF8(s); {$ifdef FPC_HAS_CPSTRING} // prevent UTF8 codepage appear in the strings - we don't need codepage // conversion magic in LCL code SetCodePage(RawByteString(Result), StringCodePage(s), False); {$endif} end else Result:=s; end; function UTF8CharacterLength(p: PChar): integer; begin if p<>nil then begin if ord(p^)<%11000000 then begin // regular single byte character (#0 is a character, this is pascal ;) Result:=1; end else begin // multi byte if ((ord(p^) and %11100000) = %11000000) then begin // could be 2 byte character if (ord(p[1]) and %11000000) = %10000000 then Result:=2 else Result:=1; end else if ((ord(p^) and %11110000) = %11100000) then begin // could be 3 byte character if ((ord(p[1]) and %11000000) = %10000000) and ((ord(p[2]) and %11000000) = %10000000) then Result:=3 else Result:=1; end else if ((ord(p^) and %11111000) = %11110000) then begin // could be 4 byte character if ((ord(p[1]) and %11000000) = %10000000) and ((ord(p[2]) and %11000000) = %10000000) and ((ord(p[3]) and %11000000) = %10000000) then Result:=4 else Result:=1; end else Result:=1; end; end else Result:=0; end; function UTF8Length(const s: string): PtrInt; begin Result:=UTF8Length(PChar(s),length(s)); end; function UTF8Length(p: PChar; ByteCount: PtrInt): PtrInt; var CharLen: LongInt; begin Result:=0; while (ByteCount>0) do begin inc(Result); CharLen:=UTF8CharacterLength(p); inc(p,CharLen); dec(ByteCount,CharLen); end; end; function UTF8CharacterToUnicode(p: PChar; out CharLen: integer): Cardinal; { if p=nil then CharLen=0 otherwise CharLen>0 If there is an encoding error the Result is undefined. Use UTF8FixBroken to fix UTF-8 encoding. It does not check if the codepoint is defined in the Unicode tables. } begin if p<>nil then begin if ord(p^)<%11000000 then begin // regular single byte character (#0 is a normal char, this is pascal ;) Result:=ord(p^); CharLen:=1; end else if ((ord(p^) and %11100000) = %11000000) then begin // starts with %110 => could be double byte character if (ord(p[1]) and %11000000) = %10000000 then begin CharLen:=2; Result:=((ord(p^) and %00011111) shl 6) or (ord(p[1]) and %00111111); if Result<(1 shl 7) then begin // wrong encoded, could be an XSS attack Result:=0; end; end else begin Result:=ord(p^); CharLen:=1; end; end else if ((ord(p^) and %11110000) = %11100000) then begin // starts with %1110 => could be triple byte character if ((ord(p[1]) and %11000000) = %10000000) and ((ord(p[2]) and %11000000) = %10000000) then begin CharLen:=3; Result:=((ord(p^) and %00011111) shl 12) or ((ord(p[1]) and %00111111) shl 6) or (ord(p[2]) and %00111111); if Result<(1 shl 11) then begin // wrong encoded, could be an XSS attack Result:=0; end; end else begin Result:=ord(p^); CharLen:=1; end; end else if ((ord(p^) and %11111000) = %11110000) then begin // starts with %11110 => could be 4 byte character if ((ord(p[1]) and %11000000) = %10000000) and ((ord(p[2]) and %11000000) = %10000000) and ((ord(p[3]) and %11000000) = %10000000) then begin CharLen:=4; Result:=((ord(p^) and %00001111) shl 18) or ((ord(p[1]) and %00111111) shl 12) or ((ord(p[2]) and %00111111) shl 6) or (ord(p[3]) and %00111111); if Result<(1 shl 16) then begin // wrong encoded, could be an XSS attack Result:=0; end; end else begin Result:=ord(p^); CharLen:=1; end; end else begin // invalid character Result:=ord(p^); CharLen:=1; end; end else begin Result:=0; CharLen:=0; end; end; function UnicodeToUTF8(CodePoint: cardinal; Buf: PChar): integer; procedure RaiseInvalidUnicode; begin raise Exception.Create('UnicodeToUTF8: invalid unicode: '+IntToStr(CodePoint)); end; begin Result:=UnicodeToUTF8Inline(CodePoint,Buf); if Result=0 then RaiseInvalidUnicode; end; function UnicodeToUTF8SkipErrors(CodePoint: cardinal; Buf: PChar): integer; begin Result:=UnicodeToUTF8Inline(CodePoint,Buf); end; function UnicodeToUTF8(CodePoint: cardinal): string; var Buf: array[0..6] of Char; Len: Integer; begin Len:=UnicodeToUTF8Inline(CodePoint, @Buf[0]); Buf[Len]:=#0; Result := StrPas(@Buf[0]); end; function UnicodeToUTF8Inline(CodePoint: cardinal; Buf: PChar): integer; begin case CodePoint of 0..$7f: begin Result:=1; Buf[0]:=char(byte(CodePoint)); end; $80..$7ff: begin Result:=2; Buf[0]:=char(byte($c0 or (CodePoint shr 6))); Buf[1]:=char(byte($80 or (CodePoint and $3f))); end; $800..$ffff: begin Result:=3; Buf[0]:=char(byte($e0 or (CodePoint shr 12))); Buf[1]:=char(byte((CodePoint shr 6) and $3f) or $80); Buf[2]:=char(byte(CodePoint and $3f) or $80); end; $10000..$10ffff: begin Result:=4; Buf[0]:=char(byte($f0 or (CodePoint shr 18))); Buf[1]:=char(byte((CodePoint shr 12) and $3f) or $80); Buf[2]:=char(byte((CodePoint shr 6) and $3f) or $80); Buf[3]:=char(byte(CodePoint and $3f) or $80); end; else Result:=0; end; end; function UTF8ToDoubleByteString(const s: string): string; var Len: Integer; begin Len:=UTF8Length(s); SetLength(Result,Len*2); if Len=0 then exit; UTF8ToDoubleByte(PChar(s),length(s),PByte(Result)); end; { returns number of double bytes } function UTF8ToDoubleByte(UTF8Str: PChar; Len: PtrInt; DBStr: PByte): PtrInt; var SrcPos: PChar; CharLen: LongInt; DestPos: PByte; u: Cardinal; begin SrcPos:=UTF8Str; DestPos:=DBStr; Result:=0; while Len>0 do begin u:=UTF8CharacterToUnicode(SrcPos,CharLen); DestPos^:=byte((u shr 8) and $ff); inc(DestPos); DestPos^:=byte(u and $ff); inc(DestPos); inc(SrcPos,CharLen); dec(Len,CharLen); inc(Result); end; end; { Find the start of the UTF8 character which contains BytePos, Len is length in byte, BytePos starts at 0 } function UTF8FindNearestCharStart(UTF8Str: PChar; Len: integer; BytePos: integer): integer; begin Result:=0; if (UTF8Str<>nil) and (Len>0) and (BytePos>=0) then begin Result:=BytePos; if Result>Len then Result:=Len-1; if (Result>0) and (ord(UTF8Str[Result]) and %11000000=%10000000) then begin dec(Result); if (Result>0) and (ord(UTF8Str[Result]) and %11000000=%10000000) then begin dec(Result); if (Result>0) and (ord(UTF8Str[Result]) and %11000000=%10000000) then begin dec(Result); // should be four byte character if (ord(UTF8Str[Result]) and %11111000<>%11110000) then begin // broken UTF8 character inc(Result,3); end else begin // is four byte character end; end else if (ord(UTF8Str[Result]) and %11110000<>%11100000) then begin // broken UTF8 character, should be three byte inc(Result,2); end else begin // is three byte character end; end else if (ord(UTF8Str[Result]) and %11100000<>%11000000) then begin // broken UTF8 character, should be two byte inc(Result); end else begin // is two byte character end; end; end; end; { Len is the length in bytes of UTF8Str CharIndex is the position of the desired char (starting at 0), in chars } function UTF8CharStart(UTF8Str: PChar; Len, CharIndex: PtrInt): PChar; var CharLen: LongInt; begin Result:=UTF8Str; if Result<>nil then begin while (CharIndex>0) and (Len>0) do begin CharLen:=UTF8CharacterLength(Result); dec(Len,CharLen); dec(CharIndex); inc(Result,CharLen); end; if (CharIndex<>0) or (Len<0) then Result:=nil; end; end; function UTF8CharToByteIndex(UTF8Str: PChar; Len, CharIndex: PtrInt): PtrInt; var p: PChar; begin p := UTF8CharStart(UTF8Str, Len, CharIndex); if p = nil then Result := -1 else Result := p - UTF8Str; end; { fix any broken UTF8 sequences with spaces } procedure UTF8FixBroken(P: PChar); var c: cardinal; begin if p=nil then exit; while p^<>#0 do begin if ord(p^)<%10000000 then begin // regular single byte character inc(p); end else if ord(p^)<%11000000 then begin // invalid p^:=' '; inc(p); end else if ((ord(p^) and %11100000) = %11000000) then begin // starts with %110 => should be 2 byte character if ((ord(p[1]) and %11000000) = %10000000) then begin c:=((ord(p^) and %00011111) shl 6); //or (ord(p[1]) and %00111111); if c<(1 shl 7) then p^:=' ' // fix XSS attack else inc(p,2) end else if p[1]<>#0 then p^:=' '; end else if ((ord(p^) and %11110000) = %11100000) then begin // starts with %1110 => should be 3 byte character if ((ord(p[1]) and %11000000) = %10000000) and ((ord(p[2]) and %11000000) = %10000000) then begin c:=((ord(p^) and %00011111) shl 12) or ((ord(p[1]) and %00111111) shl 6); //or (ord(p[2]) and %00111111); if c<(1 shl 11) then p^:=' ' // fix XSS attack else inc(p,3); end else p^:=' '; end else if ((ord(p^) and %11111000) = %11110000) then begin // starts with %11110 => should be 4 byte character if ((ord(p[1]) and %11000000) = %10000000) and ((ord(p[2]) and %11000000) = %10000000) and ((ord(p[3]) and %11000000) = %10000000) then begin c:=((ord(p^) and %00001111) shl 18) or ((ord(p[1]) and %00111111) shl 12) or ((ord(p[2]) and %00111111) shl 6); //or (ord(p[3]) and %00111111); if c<(1 shl 16) then p^:=' ' // fix XSS attack else inc(p,4) end else p^:=' '; end else begin p^:=' '; inc(p); end; end; end; procedure UTF8FixBroken(var S: string); begin if S='' then exit; if FindInvalidUTF8Character(PChar(S),length(S),true)<0 then exit; UniqueString(S); UTF8FixBroken(PChar(S)); end; function UTF8CharacterStrictLength(P: PChar): integer; begin if p=nil then exit(0); if ord(p^)<%10000000 then begin // regular single byte character exit(1); end else if ord(p^)<%11000000 then begin // invalid single byte character exit(0); end else if ((ord(p^) and %11100000) = %11000000) then begin // should be 2 byte character if (ord(p[1]) and %11000000) = %10000000 then exit(2) else exit(0); end else if ((ord(p^) and %11110000) = %11100000) then begin // should be 3 byte character if ((ord(p[1]) and %11000000) = %10000000) and ((ord(p[2]) and %11000000) = %10000000) then exit(3) else exit(0); end else if ((ord(p^) and %11111000) = %11110000) then begin // should be 4 byte character if ((ord(p[1]) and %11000000) = %10000000) and ((ord(p[2]) and %11000000) = %10000000) and ((ord(p[3]) and %11000000) = %10000000) then exit(4) else exit(0); end else exit(0); end; function UTF8CStringToUTF8String(SourceStart: PChar; SourceLen: PtrInt) : string; var Source: PChar; Dest: PChar; SourceEnd: PChar; SourceCopied: PChar; // Copies from SourceStart till Source to Dest and updates Dest procedure CopyPart; inline; var CopyLength: SizeInt; begin CopyLength := Source - SourceCopied; if CopyLength=0 then exit; System.move(SourceCopied^ , Dest^, CopyLength); SourceCopied:=Source; inc(Dest, CopyLength); end; begin SetLength(Result, SourceLen); if SourceLen=0 then exit; SourceCopied:=SourceStart; Source:=SourceStart; Dest:=PChar(Result); SourceEnd := Source + SourceLen; while Source0 then Result:=UTF8Length(PChar(SearchInText),i-1)+1; end else if StartPos>1 then begin // skip StartPosP:=UTF8CharStart(PChar(SearchInText),Length(SearchInText),StartPos-1); if StartPosP=nil then exit; // search p:=UTF8PosP(PChar(SearchForText),length(SearchForText), StartPosP,length(SearchInText)+PChar(SearchInText)-StartPosP); // get UTF-8 position if p=nil then exit; Result:=StartPos+UTF8Length(StartPosP,p-StartPosP); end; end; function UTF8PosP(SearchForText: PChar; SearchForTextLen: SizeInt; SearchInText: PChar; SearchInTextLen: SizeInt): PChar; // returns the position where SearchInText starts in SearchForText // returns nil if not found var p: SizeInt; begin Result:=nil; if (SearchForText=nil) or (SearchForTextLen=0) or (SearchInText=nil) then exit; while SearchInTextLen>0 do begin p:=IndexByte(SearchInText^,SearchInTextLen,PByte(SearchForText)^); if p<0 then exit; inc(SearchInText,p); dec(SearchInTextLen,p); if SearchInTextLen nil then begin MaxBytes:=PtrInt(PChar(s)+length(s)-StartBytePos); EndBytePos:=UTF8CharStart(StartBytePos,MaxBytes,CharCount); if EndBytePos=nil then Delete(s,StartBytePos-PChar(s)+1,MaxBytes) else Delete(s,StartBytePos-PChar(s)+1,EndBytePos-StartBytePos); end; end; procedure UTF8Insert(const source: String; var s: string; StartCharIndex: PtrInt); var StartBytePos: PChar; begin StartBytePos:=UTF8CharStart(PChar(s),length(s),StartCharIndex-1); if StartBytePos <> nil then Insert(source, s, StartBytePos-PChar(s)+1); end; { AInStr - The input string ALanguage - The language. Use '' for maximum speed if one desires to ignore the language The language should be specified in the format from ISO 639-1, which uses 2 characters to represent each language. If the language has no code in ISO 639-1, then the 3-chars code from ISO 639-2 should be used. Example: "tr" - Turkish language locale Data from here: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt The columns in the file UnicodeData.txt are explained here: http://www.ksu.ru/eng/departments/ktk/test/perl/lib/unicode/UCDFF301.html#Case Mappings } function UTF8LowerCase(const AInStr: string; ALanguage: string=''): string; var CounterDiff: PtrInt; InStr, InStrEnd, OutStr: PChar; // Language identification IsTurkish: Boolean; c1, c2, c3, new_c1, new_c2, new_c3: Char; p: SizeInt; begin Result:=AInStr; InStr := PChar(AInStr); InStrEnd := InStr + length(AInStr); // points behind last char // Does a fast initial parsing of the string to maybe avoid doing // UniqueString if the resulting string will be identical while (InStr < InStrEnd) do begin c1 := InStr^; case c1 of 'A'..'Z': Break; #$C3..#$C9, #$CE, #$CF, #$D0..#$D5, #$E1..#$E2,#$E5: begin c2 := InStr[1]; case c1 of #$C3: if c2 in [#$80..#$9E] then Break; #$C4: begin case c2 of #$80..#$AF, #$B2..#$B6: if ord(c2) mod 2 = 0 then Break; #$B8..#$FF: if ord(c2) mod 2 = 1 then Break; #$B0: Break; end; end; #$C5: begin case c2 of #$8A..#$B7: if ord(c2) mod 2 = 0 then Break; #$00..#$88, #$B9..#$FF: if ord(c2) mod 2 = 1 then Break; #$B8: Break; end; end; // Process E5 to avoid stopping on chinese chars #$E5: if (c2 = #$BC) and (InStr[2] in [#$A1..#$BA]) then Break; // Others are too complex, better not to pre-inspect them else Break; end; // already lower, or otherwhise not affected end; end; inc(InStr); end; if InStr >= InStrEnd then Exit; // Language identification IsTurkish := (ALanguage = 'tr') or (ALanguage = 'az'); // Turkish and Azeri have a special handling UniqueString(Result); OutStr := PChar(Result) + (InStr - PChar(AInStr)); CounterDiff := 0; while InStr < InStrEnd do begin c1 := InStr^; case c1 of // codepoints UTF-8 range Description Case change // $0041..$005A $41..$5A Capital ASCII X+$20 'A'..'Z': begin { First ASCII chars } // Special turkish handling // capital undotted I to small undotted i if IsTurkish and (c1 = 'I') then begin p:=OutStr - PChar(Result); SetLength(Result,Length(Result)+1);// Increase the buffer OutStr := PChar(Result)+p; OutStr^ := #$C4; inc(OutStr); OutStr^ := #$B1; dec(CounterDiff); end else begin OutStr^ := chr(ord(c1)+32); end; inc(InStr); inc(OutStr); end; // Chars with 2-bytes which might be modified #$C3..#$D5: begin c2 := InStr[1]; new_c1 := c1; new_c2 := c2; case c1 of // Latin Characters 0000–0FFF http://en.wikibooks.org/wiki/Unicode/Character_reference/0000-0FFF // codepoints UTF-8 range Description Case change // $00C0..$00D6 C3 80..C3 96 Capital Latin with accents X+$20 // $D7 C3 97 Multiplication Sign N/A // $00D8..$00DE C3 98..C3 9E Capital Latin with accents X+$20 // $DF C3 9F German beta ß already lowercase #$C3: begin case c2 of #$80..#$96, #$98..#$9E: new_c2 := chr(ord(c2) + $20) end; end; // $0100..$012F C4 80..C4 AF Capital/Small Latin accents if mod 2 = 0 then X+1 // $0130..$0131 C4 B0..C4 B1 Turkish // C4 B0 turkish uppercase dotted i -> 'i' // C4 B1 turkish lowercase undotted ı // $0132..$0137 C4 B2..C4 B7 Capital/Small Latin accents if mod 2 = 0 then X+1 // $0138 C4 B8 ĸ N/A // $0139..$024F C4 B9..C5 88 Capital/Small Latin accents if mod 2 = 1 then X+1 #$C4: begin case c2 of #$80..#$AF, #$B2..#$B7: if ord(c2) mod 2 = 0 then new_c2 := chr(ord(c2) + 1); #$B0: // Turkish begin OutStr^ := 'i'; inc(InStr, 2); inc(OutStr); inc(CounterDiff, 1); Continue; end; #$B9..#$BE: if ord(c2) mod 2 = 1 then new_c2 := chr(ord(c2) + 1); #$BF: // This crosses the borders between the first byte of the UTF-8 char begin new_c1 := #$C5; new_c2 := #$80; end; end; end; // $C589 ʼn // $C58A..$C5B7: if OldChar mod 2 = 0 then NewChar := OldChar + 1; // $C5B8: NewChar := $C3BF; // Ÿ // $C5B9..$C8B3: if OldChar mod 2 = 1 then NewChar := OldChar + 1; #$C5: begin case c2 of #$8A..#$B7: //0 begin if ord(c2) mod 2 = 0 then new_c2 := chr(ord(c2) + 1); end; #$00..#$88, #$B9..#$BE: //1 begin if ord(c2) mod 2 = 1 then new_c2 := chr(ord(c2) + 1); end; #$B8: // Ÿ begin new_c1 := #$C3; new_c2 := #$BF; end; end; end; {A convoluted part: C6 80..C6 8F 0180;LATIN SMALL LETTER B WITH STROKE;Ll;0;L;;;;;N;LATIN SMALL LETTER B BAR;;0243;;0243 0181;LATIN CAPITAL LETTER B WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER B HOOK;;;0253; => C6 81=>C9 93 0182;LATIN CAPITAL LETTER B WITH TOPBAR;Lu;0;L;;;;;N;LATIN CAPITAL LETTER B TOPBAR;;;0183; 0183;LATIN SMALL LETTER B WITH TOPBAR;Ll;0;L;;;;;N;LATIN SMALL LETTER B TOPBAR;;0182;;0182 0184;LATIN CAPITAL LETTER TONE SIX;Lu;0;L;;;;;N;;;;0185; 0185;LATIN SMALL LETTER TONE SIX;Ll;0;L;;;;;N;;;0184;;0184 0186;LATIN CAPITAL LETTER OPEN O;Lu;0;L;;;;;N;;;;0254; ==> C9 94 0187;LATIN CAPITAL LETTER C WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER C HOOK;;;0188; 0188;LATIN SMALL LETTER C WITH HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER C HOOK;;0187;;0187 0189;LATIN CAPITAL LETTER AFRICAN D;Lu;0;L;;;;;N;;;;0256; => C9 96 018A;LATIN CAPITAL LETTER D WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER D HOOK;;;0257; => C9 97 018B;LATIN CAPITAL LETTER D WITH TOPBAR;Lu;0;L;;;;;N;LATIN CAPITAL LETTER D TOPBAR;;;018C; 018C;LATIN SMALL LETTER D WITH TOPBAR;Ll;0;L;;;;;N;LATIN SMALL LETTER D TOPBAR;;018B;;018B 018D;LATIN SMALL LETTER TURNED DELTA;Ll;0;L;;;;;N;;;;; 018E;LATIN CAPITAL LETTER REVERSED E;Lu;0;L;;;;;N;LATIN CAPITAL LETTER TURNED E;;;01DD; => C7 9D 018F;LATIN CAPITAL LETTER SCHWA;Lu;0;L;;;;;N;;;;0259; => C9 99 } #$C6: begin case c2 of #$81: begin new_c1 := #$C9; new_c2 := #$93; end; #$82..#$85: begin if ord(c2) mod 2 = 0 then new_c2 := chr(ord(c2) + 1); end; #$87..#$88,#$8B..#$8C: begin if ord(c2) mod 2 = 1 then new_c2 := chr(ord(c2) + 1); end; #$86: begin new_c1 := #$C9; new_c2 := #$94; end; #$89: begin new_c1 := #$C9; new_c2 := #$96; end; #$8A: begin new_c1 := #$C9; new_c2 := #$97; end; #$8E: begin new_c1 := #$C7; new_c2 := #$9D; end; #$8F: begin new_c1 := #$C9; new_c2 := #$99; end; { And also C6 90..C6 9F 0190;LATIN CAPITAL LETTER OPEN E;Lu;0;L;;;;;N;LATIN CAPITAL LETTER EPSILON;;;025B; => C9 9B 0191;LATIN CAPITAL LETTER F WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER F HOOK;;;0192; => +1 0192;LATIN SMALL LETTER F WITH HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER SCRIPT F;;0191;;0191 <= 0193;LATIN CAPITAL LETTER G WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER G HOOK;;;0260; => C9 A0 0194;LATIN CAPITAL LETTER GAMMA;Lu;0;L;;;;;N;;;;0263; => C9 A3 0195;LATIN SMALL LETTER HV;Ll;0;L;;;;;N;LATIN SMALL LETTER H V;;01F6;;01F6 <= 0196;LATIN CAPITAL LETTER IOTA;Lu;0;L;;;;;N;;;;0269; => C9 A9 0197;LATIN CAPITAL LETTER I WITH STROKE;Lu;0;L;;;;;N;LATIN CAPITAL LETTER BARRED I;;;0268; => C9 A8 0198;LATIN CAPITAL LETTER K WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER K HOOK;;;0199; => +1 0199;LATIN SMALL LETTER K WITH HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER K HOOK;;0198;;0198 <= 019A;LATIN SMALL LETTER L WITH BAR;Ll;0;L;;;;;N;LATIN SMALL LETTER BARRED L;;023D;;023D <= 019B;LATIN SMALL LETTER LAMBDA WITH STROKE;Ll;0;L;;;;;N;LATIN SMALL LETTER BARRED LAMBDA;;;; <= 019C;LATIN CAPITAL LETTER TURNED M;Lu;0;L;;;;;N;;;;026F; => C9 AF 019D;LATIN CAPITAL LETTER N WITH LEFT HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER N HOOK;;;0272; => C9 B2 019E;LATIN SMALL LETTER N WITH LONG RIGHT LEG;Ll;0;L;;;;;N;;;0220;;0220 <= 019F;LATIN CAPITAL LETTER O WITH MIDDLE TILDE;Lu;0;L;;;;;N;LATIN CAPITAL LETTER BARRED O;;;0275; => C9 B5 } #$90: begin new_c1 := #$C9; new_c2 := #$9B; end; #$91, #$98: new_c2 := chr(ord(c2)+1); #$93: begin new_c1 := #$C9; new_c2 := #$A0; end; #$94: begin new_c1 := #$C9; new_c2 := #$A3; end; #$96: begin new_c1 := #$C9; new_c2 := #$A9; end; #$97: begin new_c1 := #$C9; new_c2 := #$A8; end; #$9C: begin new_c1 := #$C9; new_c2 := #$AF; end; #$9D: begin new_c1 := #$C9; new_c2 := #$B2; end; #$9F: begin new_c1 := #$C9; new_c2 := #$B5; end; { And also C6 A0..C6 AF 01A0;LATIN CAPITAL LETTER O WITH HORN;Lu;0;L;004F 031B;;;;N;LATIN CAPITAL LETTER O HORN;;;01A1; => +1 01A1;LATIN SMALL LETTER O WITH HORN;Ll;0;L;006F 031B;;;;N;LATIN SMALL LETTER O HORN;;01A0;;01A0 <= 01A2;LATIN CAPITAL LETTER OI;Lu;0;L;;;;;N;LATIN CAPITAL LETTER O I;;;01A3; => +1 01A3;LATIN SMALL LETTER OI;Ll;0;L;;;;;N;LATIN SMALL LETTER O I;;01A2;;01A2 <= 01A4;LATIN CAPITAL LETTER P WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER P HOOK;;;01A5; => +1 01A5;LATIN SMALL LETTER P WITH HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER P HOOK;;01A4;;01A4 <= 01A6;LATIN LETTER YR;Lu;0;L;;;;;N;LATIN LETTER Y R;;;0280; => CA 80 01A7;LATIN CAPITAL LETTER TONE TWO;Lu;0;L;;;;;N;;;;01A8; => +1 01A8;LATIN SMALL LETTER TONE TWO;Ll;0;L;;;;;N;;;01A7;;01A7 <= 01A9;LATIN CAPITAL LETTER ESH;Lu;0;L;;;;;N;;;;0283; => CA 83 01AA;LATIN LETTER REVERSED ESH LOOP;Ll;0;L;;;;;N;;;;; 01AB;LATIN SMALL LETTER T WITH PALATAL HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER T PALATAL HOOK;;;; <= 01AC;LATIN CAPITAL LETTER T WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER T HOOK;;;01AD; => +1 01AD;LATIN SMALL LETTER T WITH HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER T HOOK;;01AC;;01AC <= 01AE;LATIN CAPITAL LETTER T WITH RETROFLEX HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER T RETROFLEX HOOK;;;0288; => CA 88 01AF;LATIN CAPITAL LETTER U WITH HORN;Lu;0;L;0055 031B;;;;N;LATIN CAPITAL LETTER U HORN;;;01B0; => +1 } #$A0..#$A5,#$AC: begin if ord(c2) mod 2 = 0 then new_c2 := chr(ord(c2) + 1); end; #$A7,#$AF: begin if ord(c2) mod 2 = 1 then new_c2 := chr(ord(c2) + 1); end; #$A6: begin new_c1 := #$CA; new_c2 := #$80; end; #$A9: begin new_c1 := #$CA; new_c2 := #$83; end; #$AE: begin new_c1 := #$CA; new_c2 := #$88; end; { And also C6 B0..C6 BF 01B0;LATIN SMALL LETTER U WITH HORN;Ll;0;L;0075 031B;;;;N;LATIN SMALL LETTER U HORN;;01AF;;01AF <= -1 01B1;LATIN CAPITAL LETTER UPSILON;Lu;0;L;;;;;N;;;;028A; => CA 8A 01B2;LATIN CAPITAL LETTER V WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER SCRIPT V;;;028B; => CA 8B 01B3;LATIN CAPITAL LETTER Y WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER Y HOOK;;;01B4; => +1 01B4;LATIN SMALL LETTER Y WITH HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER Y HOOK;;01B3;;01B3 <= 01B5;LATIN CAPITAL LETTER Z WITH STROKE;Lu;0;L;;;;;N;LATIN CAPITAL LETTER Z BAR;;;01B6; => +1 01B6;LATIN SMALL LETTER Z WITH STROKE;Ll;0;L;;;;;N;LATIN SMALL LETTER Z BAR;;01B5;;01B5 <= 01B7;LATIN CAPITAL LETTER EZH;Lu;0;L;;;;;N;LATIN CAPITAL LETTER YOGH;;;0292; => CA 92 01B8;LATIN CAPITAL LETTER EZH REVERSED;Lu;0;L;;;;;N;LATIN CAPITAL LETTER REVERSED YOGH;;;01B9; => +1 01B9;LATIN SMALL LETTER EZH REVERSED;Ll;0;L;;;;;N;LATIN SMALL LETTER REVERSED YOGH;;01B8;;01B8 <= 01BA;LATIN SMALL LETTER EZH WITH TAIL;Ll;0;L;;;;;N;LATIN SMALL LETTER YOGH WITH TAIL;;;; <= 01BB;LATIN LETTER TWO WITH STROKE;Lo;0;L;;;;;N;LATIN LETTER TWO BAR;;;; X 01BC;LATIN CAPITAL LETTER TONE FIVE;Lu;0;L;;;;;N;;;;01BD; => +1 01BD;LATIN SMALL LETTER TONE FIVE;Ll;0;L;;;;;N;;;01BC;;01BC <= 01BE;LATIN LETTER INVERTED GLOTTAL STOP WITH STROKE;Ll;0;L;;;;;N;LATIN LETTER INVERTED GLOTTAL STOP BAR;;;; X 01BF;LATIN LETTER WYNN;Ll;0;L;;;;;N;;;01F7;;01F7 <= } #$B8,#$BC: begin if ord(c2) mod 2 = 0 then new_c2 := chr(ord(c2) + 1); end; #$B3..#$B6: begin if ord(c2) mod 2 = 1 then new_c2 := chr(ord(c2) + 1); end; #$B1: begin new_c1 := #$CA; new_c2 := #$8A; end; #$B2: begin new_c1 := #$CA; new_c2 := #$8B; end; #$B7: begin new_c1 := #$CA; new_c2 := #$92; end; end; end; #$C7: begin case c2 of #$84..#$8C,#$B1..#$B3: begin if (ord(c2) and $F) mod 3 = 1 then new_c2 := chr(ord(c2) + 2) else if (ord(c2) and $F) mod 3 = 2 then new_c2 := chr(ord(c2) + 1); end; #$8D..#$9C: begin if ord(c2) mod 2 = 1 then new_c2 := chr(ord(c2) + 1); end; #$9E..#$AF,#$B4..#$B5,#$B8..#$BF: begin if ord(c2) mod 2 = 0 then new_c2 := chr(ord(c2) + 1); end; { 01F6;LATIN CAPITAL LETTER HWAIR;Lu;0;L;;;;;N;;;;0195; 01F7;LATIN CAPITAL LETTER WYNN;Lu;0;L;;;;;N;;;;01BF; } #$B6: begin new_c1 := #$C6; new_c2 := #$95; end; #$B7: begin new_c1 := #$C6; new_c2 := #$BF; end; end; end; { Codepoints 0200 to 023F } #$C8: begin // For this one we can simply start with a default and override for some specifics if (c2 in [#$80..#$B3]) and (ord(c2) mod 2 = 0) then new_c2 := chr(ord(c2) + 1); case c2 of #$A0: begin new_c1 := #$C6; new_c2 := #$9E; end; #$A1: new_c2 := c2; { 023A;LATIN CAPITAL LETTER A WITH STROKE;Lu;0;L;;;;;N;;;;2C65; => E2 B1 A5 023B;LATIN CAPITAL LETTER C WITH STROKE;Lu;0;L;;;;;N;;;;023C; => +1 023C;LATIN SMALL LETTER C WITH STROKE;Ll;0;L;;;;;N;;;023B;;023B <= 023D;LATIN CAPITAL LETTER L WITH BAR;Lu;0;L;;;;;N;;;;019A; => C6 9A 023E;LATIN CAPITAL LETTER T WITH DIAGONAL STROKE;Lu;0;L;;;;;N;;;;2C66; => E2 B1 A6 023F;LATIN SMALL LETTER S WITH SWASH TAIL;Ll;0;L;;;;;N;;;2C7E;;2C7E <= 0240;LATIN SMALL LETTER Z WITH SWASH TAIL;Ll;0;L;;;;;N;;;2C7F;;2C7F <= } #$BA,#$BE: begin p:= OutStr - PChar(Result); SetLength(Result,Length(Result)+1);// Increase the buffer OutStr := PChar(Result)+p; OutStr^ := #$E2; inc(OutStr); OutStr^ := #$B1; inc(OutStr); if c2 = #$BA then OutStr^ := #$A5 else OutStr^ := #$A6; dec(CounterDiff); inc(OutStr); inc(InStr, 2); Continue; end; #$BD: begin new_c1 := #$C6; new_c2 := #$9A; end; #$BB: new_c2 := chr(ord(c2) + 1); end; end; { Codepoints 0240 to 027F Here only 0240..024F needs lowercase } #$C9: begin case c2 of #$81..#$82: begin if ord(c2) mod 2 = 1 then new_c2 := chr(ord(c2) + 1); end; #$86..#$8F: begin if ord(c2) mod 2 = 0 then new_c2 := chr(ord(c2) + 1); end; #$83: begin new_c1 := #$C6; new_c2 := #$80; end; #$84: begin new_c1 := #$CA; new_c2 := #$89; end; #$85: begin new_c1 := #$CA; new_c2 := #$8C; end; end; end; // $CE91..$CE9F: NewChar := OldChar + $20; // Greek Characters // $CEA0..$CEA9: NewChar := OldChar + $E0; // Greek Characters #$CE: begin case c2 of // 0380 = CE 80 #$86: new_c2 := #$AC; #$88: new_c2 := #$AD; #$89: new_c2 := #$AE; #$8A: new_c2 := #$AF; #$8C: new_c1 := #$CF; // By coincidence new_c2 remains the same #$8E: begin new_c1 := #$CF; new_c2 := #$8D; end; #$8F: begin new_c1 := #$CF; new_c2 := #$8E; end; // 0390 = CE 90 #$91..#$9F: begin new_c2 := chr(ord(c2) + $20); end; // 03A0 = CE A0 #$A0..#$AB: begin new_c1 := #$CF; new_c2 := chr(ord(c2) - $20); end; end; end; // 03C0 = CF 80 // 03D0 = CF 90 // 03E0 = CF A0 // 03F0 = CF B0 #$CF: begin case c2 of // 03CF;GREEK CAPITAL KAI SYMBOL;Lu;0;L;;;;;N;;;;03D7; CF 8F => CF 97 #$8F: new_c2 := #$97; // 03D8;GREEK LETTER ARCHAIC KOPPA;Lu;0;L;;;;;N;;;;03D9; #$98: new_c2 := #$99; // 03DA;GREEK LETTER STIGMA;Lu;0;L;;;;;N;GREEK CAPITAL LETTER STIGMA;;;03DB; #$9A: new_c2 := #$9B; // 03DC;GREEK LETTER DIGAMMA;Lu;0;L;;;;;N;GREEK CAPITAL LETTER DIGAMMA;;;03DD; #$9C: new_c2 := #$9D; // 03DE;GREEK LETTER KOPPA;Lu;0;L;;;;;N;GREEK CAPITAL LETTER KOPPA;;;03DF; #$9E: new_c2 := #$9F; { 03E0;GREEK LETTER SAMPI;Lu;0;L;;;;;N;GREEK CAPITAL LETTER SAMPI;;;03E1; 03E1;GREEK SMALL LETTER SAMPI;Ll;0;L;;;;;N;;;03E0;;03E0 03E2;COPTIC CAPITAL LETTER SHEI;Lu;0;L;;;;;N;GREEK CAPITAL LETTER SHEI;;;03E3; 03E3;COPTIC SMALL LETTER SHEI;Ll;0;L;;;;;N;GREEK SMALL LETTER SHEI;;03E2;;03E2 ... 03EE;COPTIC CAPITAL LETTER DEI;Lu;0;L;;;;;N;GREEK CAPITAL LETTER DEI;;;03EF; 03EF;COPTIC SMALL LETTER DEI;Ll;0;L;;;;;N;GREEK SMALL LETTER DEI;;03EE;;03EE } #$A0..#$AF: if ord(c2) mod 2 = 0 then new_c2 := chr(ord(c2) + 1); // 03F4;GREEK CAPITAL THETA SYMBOL;Lu;0;L; 0398;;;;N;;;;03B8; #$B4: begin new_c1 := #$CE; new_c2 := #$B8; end; // 03F7;GREEK CAPITAL LETTER SHO;Lu;0;L;;;;;N;;;;03F8; #$B7: new_c2 := #$B8; // 03F9;GREEK CAPITAL LUNATE SIGMA SYMBOL;Lu;0;L; 03A3;;;;N;;;;03F2; #$B9: new_c2 := #$B2; // 03FA;GREEK CAPITAL LETTER SAN;Lu;0;L;;;;;N;;;;03FB; #$BA: new_c2 := #$BB; // 03FD;GREEK CAPITAL REVERSED LUNATE SIGMA SYMBOL;Lu;0;L;;;;;N;;;;037B; #$BD: begin new_c1 := #$CD; new_c2 := #$BB; end; // 03FE;GREEK CAPITAL DOTTED LUNATE SIGMA SYMBOL;Lu;0;L;;;;;N;;;;037C; #$BE: begin new_c1 := #$CD; new_c2 := #$BC; end; // 03FF;GREEK CAPITAL REVERSED DOTTED LUNATE SIGMA SYMBOL;Lu;0;L;;;;;N;;;;037D; #$BF: begin new_c1 := #$CD; new_c2 := #$BD; end; end; end; // $D080..$D08F: NewChar := OldChar + $110; // Cyrillic alphabet // $D090..$D09F: NewChar := OldChar + $20; // Cyrillic alphabet // $D0A0..$D0AF: NewChar := OldChar + $E0; // Cyrillic alphabet #$D0: begin c2 := InStr[1]; case c2 of #$80..#$8F: begin new_c1 := chr(ord(c1)+1); new_c2 := chr(ord(c2) + $10); end; #$90..#$9F: begin new_c2 := chr(ord(c2) + $20); end; #$A0..#$AF: begin new_c1 := chr(ord(c1)+1); new_c2 := chr(ord(c2) - $20); end; end; end; // Archaic and non-slavic cyrillic 460-47F = D1A0-D1BF // These require just adding 1 to get the lowercase #$D1: begin if (c2 in [#$A0..#$BF]) and (ord(c2) mod 2 = 0) then new_c2 := chr(ord(c2) + 1); end; // Archaic and non-slavic cyrillic 480-4BF = D280-D2BF // These mostly require just adding 1 to get the lowercase #$D2: begin case c2 of #$80: begin new_c2 := chr(ord(c2) + 1); end; // #$81 is already lowercase // #$82-#$89 ??? #$8A..#$BF: begin if ord(c2) mod 2 = 0 then new_c2 := chr(ord(c2) + 1); end; end; end; { Codepoints 04C0..04FF } #$D3: begin case c2 of #$80: new_c2 := #$8F; #$81..#$8E: begin if ord(c2) mod 2 = 1 then new_c2 := chr(ord(c2) + 1); end; #$90..#$BF: begin if ord(c2) mod 2 = 0 then new_c2 := chr(ord(c2) + 1); end; end; end; { Codepoints 0500..053F Armenian starts in 0531 } #$D4: begin if ord(c2) mod 2 = 0 then new_c2 := chr(ord(c2) + 1); // Armenian if c2 in [#$B1..#$BF] then begin new_c1 := #$D5; new_c2 := chr(ord(c2) - $10); end; end; { Codepoints 0540..057F Armenian } #$D5: begin case c2 of #$80..#$8F: begin new_c2 := chr(ord(c2) + $30); end; #$90..#$96: begin new_c1 := #$D6; new_c2 := chr(ord(c2) - $10); end; end; end; end; // Common code 2-byte modifiable chars if (CounterDiff <> 0) then begin OutStr^ := new_c1; OutStr[1] := new_c2; end else begin if (new_c1 <> c1) then OutStr^ := new_c1; if (new_c2 <> c2) then OutStr[1] := new_c2; end; inc(InStr, 2); inc(OutStr, 2); end; { Characters with 3 bytes } #$E1: begin new_c1 := c1; c2 := InStr[1]; c3 := InStr[2]; new_c2 := c2; new_c3 := c3; { Georgian codepoints 10A0-10C5 => 2D00-2D25 In UTF-8 this is: E1 82 A0 - E1 82 BF => E2 B4 80 - E2 B4 9F E1 83 80 - E1 83 85 => E2 B4 A0 - E2 B4 A5 } case c2 of #$82: if (c3 in [#$A0..#$BF]) then begin new_c1 := #$E2; new_c2 := #$B4; new_c3 := chr(ord(c3) - $20); end; #$83: if (c3 in [#$80..#$85]) then begin new_c1 := #$E2; new_c2 := #$B4; new_c3 := chr(ord(c3) + $20); end; { Extra chars between 1E00..1EFF Blocks of chars: 1E00..1E3F E1 B8 80..E1 B8 BF 1E40..1E7F E1 B9 80..E1 B9 BF 1E80..1EBF E1 BA 80..E1 BA BF 1EC0..1EFF E1 BB 80..E1 BB BF } #$B8..#$BB: begin // Start with a default and change for some particular chars if ord(c3) mod 2 = 0 then new_c3 := chr(ord(c3) + 1); { Only 1E96..1E9F are different E1 BA 96..E1 BA 9F 1E96;LATIN SMALL LETTER H WITH LINE BELOW;Ll;0;L;0068 0331;;;;N;;;;; 1E97;LATIN SMALL LETTER T WITH DIAERESIS;Ll;0;L;0074 0308;;;;N;;;;; 1E98;LATIN SMALL LETTER W WITH RING ABOVE;Ll;0;L;0077 030A;;;;N;;;;; 1E99;LATIN SMALL LETTER Y WITH RING ABOVE;Ll;0;L;0079 030A;;;;N;;;;; 1E9A;LATIN SMALL LETTER A WITH RIGHT HALF RING;Ll;0;L; 0061 02BE;;;;N;;;;; 1E9B;LATIN SMALL LETTER LONG S WITH DOT ABOVE;Ll;0;L;017F 0307;;;;N;;;1E60;;1E60 1E9C;LATIN SMALL LETTER LONG S WITH DIAGONAL STROKE;Ll;0;L;;;;;N;;;;; 1E9D;LATIN SMALL LETTER LONG S WITH HIGH STROKE;Ll;0;L;;;;;N;;;;; 1E9E;LATIN CAPITAL LETTER SHARP S;Lu;0;L;;;;;N;;;;00DF; => C3 9F 1E9F;LATIN SMALL LETTER DELTA;Ll;0;L;;;;;N;;;;; } if (c2 = #$BA) and (c3 in [#$96..#$9F]) then new_c3 := c3; // LATIN CAPITAL LETTER SHARP S => to german Beta if (c2 = #$BA) and (c3 = #$9E) then begin inc(InStr, 3); OutStr^ := #$C3; inc(OutStr); OutStr^ := #$9F; inc(OutStr); inc(CounterDiff, 1); Continue; end; end; { Extra chars between 1F00..1FFF Blocks of chars: 1E00..1E3F E1 BC 80..E1 BC BF 1E40..1E7F E1 BD 80..E1 BD BF 1E80..1EBF E1 BE 80..E1 BE BF 1EC0..1EFF E1 BF 80..E1 BF BF } #$BC: begin // Start with a default and change for some particular chars if (ord(c3) mod $10) div 8 = 1 then new_c3 := chr(ord(c3) - 8); end; #$BD: begin // Start with a default and change for some particular chars case c3 of #$80..#$8F, #$A0..#$AF: if (ord(c3) mod $10) div 8 = 1 then new_c3 := chr(ord(c3) - 8); { 1F50;GREEK SMALL LETTER UPSILON WITH PSILI;Ll;0;L;03C5 0313;;;;N;;;;; 1F51;GREEK SMALL LETTER UPSILON WITH DASIA;Ll;0;L;03C5 0314;;;;N;;;1F59;;1F59 1F52;GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA;Ll;0;L;1F50 0300;;;;N;;;;; 1F53;GREEK SMALL LETTER UPSILON WITH DASIA AND VARIA;Ll;0;L;1F51 0300;;;;N;;;1F5B;;1F5B 1F54;GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA;Ll;0;L;1F50 0301;;;;N;;;;; 1F55;GREEK SMALL LETTER UPSILON WITH DASIA AND OXIA;Ll;0;L;1F51 0301;;;;N;;;1F5D;;1F5D 1F56;GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI;Ll;0;L;1F50 0342;;;;N;;;;; 1F57;GREEK SMALL LETTER UPSILON WITH DASIA AND PERISPOMENI;Ll;0;L;1F51 0342;;;;N;;;1F5F;;1F5F 1F59;GREEK CAPITAL LETTER UPSILON WITH DASIA;Lu;0;L;03A5 0314;;;;N;;;;1F51; 1F5B;GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA;Lu;0;L;1F59 0300;;;;N;;;;1F53; 1F5D;GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA;Lu;0;L;1F59 0301;;;;N;;;;1F55; 1F5F;GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI;Lu;0;L;1F59 0342;;;;N;;;;1F57; } #$99,#$9B,#$9D,#$9F: new_c3 := chr(ord(c3) - 8); end; end; #$BE: begin // Start with a default and change for some particular chars case c3 of #$80..#$B9: if (ord(c3) mod $10) div 8 = 1 then new_c3 := chr(ord(c3) - 8); { 1FB0;GREEK SMALL LETTER ALPHA WITH VRACHY;Ll;0;L;03B1 0306;;;;N;;;1FB8;;1FB8 1FB1;GREEK SMALL LETTER ALPHA WITH MACRON;Ll;0;L;03B1 0304;;;;N;;;1FB9;;1FB9 1FB2;GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI;Ll;0;L;1F70 0345;;;;N;;;;; 1FB3;GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI;Ll;0;L;03B1 0345;;;;N;;;1FBC;;1FBC 1FB4;GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI;Ll;0;L;03AC 0345;;;;N;;;;; 1FB6;GREEK SMALL LETTER ALPHA WITH PERISPOMENI;Ll;0;L;03B1 0342;;;;N;;;;; 1FB7;GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI;Ll;0;L;1FB6 0345;;;;N;;;;; 1FB8;GREEK CAPITAL LETTER ALPHA WITH VRACHY;Lu;0;L;0391 0306;;;;N;;;;1FB0; 1FB9;GREEK CAPITAL LETTER ALPHA WITH MACRON;Lu;0;L;0391 0304;;;;N;;;;1FB1; 1FBA;GREEK CAPITAL LETTER ALPHA WITH VARIA;Lu;0;L;0391 0300;;;;N;;;;1F70; 1FBB;GREEK CAPITAL LETTER ALPHA WITH OXIA;Lu;0;L;0386;;;;N;;;;1F71; 1FBC;GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI;Lt;0;L;0391 0345;;;;N;;;;1FB3; 1FBD;GREEK KORONIS;Sk;0;ON; 0020 0313;;;;N;;;;; 1FBE;GREEK PROSGEGRAMMENI;Ll;0;L;03B9;;;;N;;;0399;;0399 1FBF;GREEK PSILI;Sk;0;ON; 0020 0313;;;;N;;;;; } #$BA: begin new_c2 := #$BD; new_c3 := #$B0; end; #$BB: begin new_c2 := #$BD; new_c3 := #$B1; end; #$BC: new_c3 := #$B3; end; end; end; if (CounterDiff <> 0) then begin OutStr^ := new_c1; OutStr[1] := new_c2; OutStr[2] := new_c3; end else begin if c1 <> new_c1 then OutStr^ := new_c1; if c2 <> new_c2 then OutStr[1] := new_c2; if c3 <> new_c3 then OutStr[2] := new_c3; end; inc(InStr, 3); inc(OutStr, 3); end; { More Characters with 3 bytes, so exotic stuff between: $2126..$2183 E2 84 A6..E2 86 83 $24B6..$24CF Result:=u+26; E2 92 B6..E2 93 8F $2C00..$2C2E Result:=u+48; E2 B0 80..E2 B0 AE $2C60..$2CE2 E2 B1 A0..E2 B3 A2 } #$E2: begin new_c1 := c1; c2 := InStr[1]; c3 := InStr[2]; new_c2 := c2; new_c3 := c3; // 2126;OHM SIGN;Lu;0;L;03A9;;;;N;OHM;;;03C9; E2 84 A6 => CF 89 if (c2 = #$84) and (c3 = #$A6) then begin inc(InStr, 3); OutStr^ := #$CF; inc(OutStr); OutStr^ := #$89; inc(OutStr); inc(CounterDiff, 1); Continue; end { 212A;KELVIN SIGN;Lu;0;L;004B;;;;N;DEGREES KELVIN;;;006B; E2 84 AA => 6B } else if (c2 = #$84) and (c3 = #$AA) then begin inc(InStr, 3); if c3 = #$AA then OutStr^ := #$6B else OutStr^ := #$E5; inc(OutStr); inc(CounterDiff, 2); Continue; end { 212B;ANGSTROM SIGN;Lu;0;L;00C5;;;;N;ANGSTROM UNIT;;;00E5; E2 84 AB => C3 A5 } else if (c2 = #$84) and (c3 = #$AB) then begin inc(InStr, 3); OutStr^ := #$C3; inc(OutStr); OutStr^ := #$A5; inc(OutStr); inc(CounterDiff, 1); Continue; end { 2160;ROMAN NUMERAL ONE;Nl;0;L; 0049;;;1;N;;;;2170; E2 85 A0 => E2 85 B0 2161;ROMAN NUMERAL TWO;Nl;0;L; 0049 0049;;;2;N;;;;2171; 2162;ROMAN NUMERAL THREE;Nl;0;L; 0049 0049 0049;;;3;N;;;;2172; 2163;ROMAN NUMERAL FOUR;Nl;0;L; 0049 0056;;;4;N;;;;2173; 2164;ROMAN NUMERAL FIVE;Nl;0;L; 0056;;;5;N;;;;2174; 2165;ROMAN NUMERAL SIX;Nl;0;L; 0056 0049;;;6;N;;;;2175; 2166;ROMAN NUMERAL SEVEN;Nl;0;L; 0056 0049 0049;;;7;N;;;;2176; 2167;ROMAN NUMERAL EIGHT;Nl;0;L; 0056 0049 0049 0049;;;8;N;;;;2177; 2168;ROMAN NUMERAL NINE;Nl;0;L; 0049 0058;;;9;N;;;;2178; 2169;ROMAN NUMERAL TEN;Nl;0;L; 0058;;;10;N;;;;2179; 216A;ROMAN NUMERAL ELEVEN;Nl;0;L; 0058 0049;;;11;N;;;;217A; 216B;ROMAN NUMERAL TWELVE;Nl;0;L; 0058 0049 0049;;;12;N;;;;217B; 216C;ROMAN NUMERAL FIFTY;Nl;0;L; 004C;;;50;N;;;;217C; 216D;ROMAN NUMERAL ONE HUNDRED;Nl;0;L; 0043;;;100;N;;;;217D; 216E;ROMAN NUMERAL FIVE HUNDRED;Nl;0;L; 0044;;;500;N;;;;217E; 216F;ROMAN NUMERAL ONE THOUSAND;Nl;0;L; 004D;;;1000;N;;;;217F; } else if (c2 = #$85) and (c3 in [#$A0..#$AF]) then new_c3 := chr(ord(c3) + $10) { 2183;ROMAN NUMERAL REVERSED ONE HUNDRED;Lu;0;L;;;;;N;;;;2184; E2 86 83 => E2 86 84 } else if (c2 = #$86) and (c3 = #$83) then new_c3 := chr(ord(c3) + 1) { $24B6..$24CF Result:=u+26; E2 92 B6..E2 93 8F Ex: 24B6;CIRCLED LATIN CAPITAL LETTER A;So;0;L; 0041;;;;N;;;;24D0; E2 92 B6 => E2 93 90 } else if (c2 = #$92) and (c3 in [#$B6..#$BF]) then begin new_c3 := #$93; new_c3 := chr(ord(c3) - $26); end else if (c2 = #$93) and (c3 in [#$80..#$8F]) then new_c3 := chr(ord(c3) + 26) { $2C00..$2C2E Result:=u+48; E2 B0 80..E2 B0 AE 2C00;GLAGOLITIC CAPITAL LETTER AZU;Lu;0;L;;;;;N;;;;2C30; E2 B0 80 => E2 B0 B0 2C10;GLAGOLITIC CAPITAL LETTER NASHI;Lu;0;L;;;;;N;;;;2C40; E2 B0 90 => E2 B1 80 } else if (c2 = #$B0) and (c3 in [#$80..#$8F]) then new_c3 := chr(ord(c3) + $30) else if (c2 = #$B0) and (c3 in [#$90..#$AE]) then begin new_c2 := #$B1; new_c3 := chr(ord(c3) - $10); end { $2C60..$2CE2 E2 B1 A0..E2 B3 A2 2C60;LATIN CAPITAL LETTER L WITH DOUBLE BAR;Lu;0;L;;;;;N;;;;2C61; E2 B1 A0 => +1 2C61;LATIN SMALL LETTER L WITH DOUBLE BAR;Ll;0;L;;;;;N;;;2C60;;2C60 2C62;LATIN CAPITAL LETTER L WITH MIDDLE TILDE;Lu;0;L;;;;;N;;;;026B; => C9 AB 2C63;LATIN CAPITAL LETTER P WITH STROKE;Lu;0;L;;;;;N;;;;1D7D; => E1 B5 BD 2C64;LATIN CAPITAL LETTER R WITH TAIL;Lu;0;L;;;;;N;;;;027D; => C9 BD 2C65;LATIN SMALL LETTER A WITH STROKE;Ll;0;L;;;;;N;;;023A;;023A 2C66;LATIN SMALL LETTER T WITH DIAGONAL STROKE;Ll;0;L;;;;;N;;;023E;;023E 2C67;LATIN CAPITAL LETTER H WITH DESCENDER;Lu;0;L;;;;;N;;;;2C68; => E2 B1 A8 2C68;LATIN SMALL LETTER H WITH DESCENDER;Ll;0;L;;;;;N;;;2C67;;2C67 2C69;LATIN CAPITAL LETTER K WITH DESCENDER;Lu;0;L;;;;;N;;;;2C6A; => E2 B1 AA 2C6A;LATIN SMALL LETTER K WITH DESCENDER;Ll;0;L;;;;;N;;;2C69;;2C69 2C6B;LATIN CAPITAL LETTER Z WITH DESCENDER;Lu;0;L;;;;;N;;;;2C6C; => E2 B1 AC 2C6C;LATIN SMALL LETTER Z WITH DESCENDER;Ll;0;L;;;;;N;;;2C6B;;2C6B 2C6D;LATIN CAPITAL LETTER ALPHA;Lu;0;L;;;;;N;;;;0251; => C9 91 2C6E;LATIN CAPITAL LETTER M WITH HOOK;Lu;0;L;;;;;N;;;;0271; => C9 B1 2C6F;LATIN CAPITAL LETTER TURNED A;Lu;0;L;;;;;N;;;;0250; => C9 90 2C70;LATIN CAPITAL LETTER TURNED ALPHA;Lu;0;L;;;;;N;;;;0252; => C9 92 } else if (c2 = #$B1) then begin case c3 of #$A0: new_c3 := chr(ord(c3)+1); #$A2,#$A4,#$AD..#$AF,#$B0: begin inc(InStr, 3); OutStr^ := #$C9; inc(OutStr); case c3 of #$A2: OutStr^ := #$AB; #$A4: OutStr^ := #$BD; #$AD: OutStr^ := #$90; #$AE: OutStr^ := #$B1; #$AF: OutStr^ := #$90; #$B0: OutStr^ := #$92; end; inc(OutStr); inc(CounterDiff, 1); Continue; end; #$A3: begin new_c2 := #$B5; new_c3 := #$BD; end; #$A7,#$A9,#$AB: new_c3 := chr(ord(c3)+1); { 2C71;LATIN SMALL LETTER V WITH RIGHT HOOK;Ll;0;L;;;;;N;;;;; 2C72;LATIN CAPITAL LETTER W WITH HOOK;Lu;0;L;;;;;N;;;;2C73; 2C73;LATIN SMALL LETTER W WITH HOOK;Ll;0;L;;;;;N;;;2C72;;2C72 2C74;LATIN SMALL LETTER V WITH CURL;Ll;0;L;;;;;N;;;;; 2C75;LATIN CAPITAL LETTER HALF H;Lu;0;L;;;;;N;;;;2C76; 2C76;LATIN SMALL LETTER HALF H;Ll;0;L;;;;;N;;;2C75;;2C75 2C77;LATIN SMALL LETTER TAILLESS PHI;Ll;0;L;;;;;N;;;;; 2C78;LATIN SMALL LETTER E WITH NOTCH;Ll;0;L;;;;;N;;;;; 2C79;LATIN SMALL LETTER TURNED R WITH TAIL;Ll;0;L;;;;;N;;;;; 2C7A;LATIN SMALL LETTER O WITH LOW RING INSIDE;Ll;0;L;;;;;N;;;;; 2C7B;LATIN LETTER SMALL CAPITAL TURNED E;Ll;0;L;;;;;N;;;;; 2C7C;LATIN SUBSCRIPT SMALL LETTER J;Ll;0;L; 006A;;;;N;;;;; 2C7D;MODIFIER LETTER CAPITAL V;Lm;0;L; 0056;;;;N;;;;; 2C7E;LATIN CAPITAL LETTER S WITH SWASH TAIL;Lu;0;L;;;;;N;;;;023F; => C8 BF 2C7F;LATIN CAPITAL LETTER Z WITH SWASH TAIL;Lu;0;L;;;;;N;;;;0240; => C9 80 } #$B2,#$B5: new_c3 := chr(ord(c3)+1); #$BE,#$BF: begin inc(InStr, 3); case c3 of #$BE: OutStr^ := #$C8; #$BF: OutStr^ := #$C9; end; OutStr^ := #$C8; inc(OutStr); case c3 of #$BE: OutStr^ := #$BF; #$BF: OutStr^ := #$80; end; inc(OutStr); inc(CounterDiff, 1); Continue; end; end; end { 2C80;COPTIC CAPITAL LETTER ALFA;Lu;0;L;;;;;N;;;;2C81; E2 B2 80 => E2 B2 81 ... 2CBE;COPTIC CAPITAL LETTER OLD COPTIC OOU;Lu;0;L;;;;;N;;;;2CBF; E2 B2 BE => E2 B2 BF 2CBF;COPTIC SMALL LETTER OLD COPTIC OOU;Ll;0;L;;;;;N;;;2CBE;;2CBE ... 2CC0;COPTIC CAPITAL LETTER SAMPI;Lu;0;L;;;;;N;;;;2CC1; E2 B3 80 => E2 B2 81 2CC1;COPTIC SMALL LETTER SAMPI;Ll;0;L;;;;;N;;;2CC0;;2CC0 ... 2CE2;COPTIC CAPITAL LETTER OLD NUBIAN WAU;Lu;0;L;;;;;N;;;;2CE3; E2 B3 A2 => E2 B3 A3 2CE3;COPTIC SMALL LETTER OLD NUBIAN WAU;Ll;0;L;;;;;N;;;2CE2;;2CE2 <= } else if (c2 = #$B2) then begin if ord(c3) mod 2 = 0 then new_c3 := chr(ord(c3) + 1); end else if (c2 = #$B3) and (c3 in [#$80..#$A3]) then begin if ord(c3) mod 2 = 0 then new_c3 := chr(ord(c3) + 1); end; if (CounterDiff <> 0) then begin OutStr^ := new_c1; OutStr[1] := new_c2; OutStr[2] := new_c3; end else begin if c1 <> new_c1 then OutStr^ := new_c1; if c2 <> new_c2 then OutStr[1] := new_c2; if c3 <> new_c3 then OutStr[2] := new_c3; end; inc(InStr, 3); inc(OutStr, 3); end; { FF21;FULLWIDTH LATIN CAPITAL LETTER A;Lu;0;L; 0041;;;;N;;;;FF41; EF BC A1 => EF BD 81 ... FF3A;FULLWIDTH LATIN CAPITAL LETTER Z;Lu;0;L; 005A;;;;N;;;;FF5A; EF BC BA => EF BD 9A } #$EF: begin c2 := InStr[1]; c3 := InStr[2]; if (c2 = #$BC) and (c3 in [#$A1..#$BA]) then begin OutStr^ := c1; OutStr[1] := #$BD; OutStr[2] := chr(ord(c3) - $20); end; if (CounterDiff <> 0) then begin OutStr^ := c1; OutStr[1] := c2; OutStr[2] := c3; end; inc(InStr, 3); inc(OutStr, 3); end; else // Copy the character if the string was disaligned by previous changes if (CounterDiff <> 0) then OutStr^:= c1; inc(InStr); inc(OutStr); end; // Case InStr^ end; // while // Final correction of the buffer size SetLength(Result,OutStr - PChar(Result)); end; { AInStr - The input string ALanguage - The language. Use '' for maximum speed if one desires to ignore the language The language should be specified in the format from ISO 639-1, which uses 2 characters to represent each language. If the language has no code in ISO 639-1, then the 3-chars code from ISO 639-2 should be used. Example: "tr" - Turkish language locale Data from here: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt The columns in the file UnicodeData.txt are explained here: http://www.ksu.ru/eng/departments/ktk/test/perl/lib/unicode/UCDFF301.html#Case Mappings } function UTF8UpperCase(const AInStr: string; ALanguage: string=''): string; var i, InCounter, OutCounter: PtrInt; OutStr: PChar; CharLen: integer; CharProcessed: Boolean; NewCharLen: integer; NewChar, OldChar: Word; // Language identification IsTurkish: Boolean; begin // Start with the same string, and progressively modify Result:=AInStr; UniqueString(Result); OutStr := PChar(Result); // Language identification IsTurkish := (ALanguage = 'tr') or (ALanguage = 'az'); // Turkish and Azeri have a special handling InCounter:=1; // for AInStr OutCounter := 0; // for Result while InCounter<=length(AInStr) do begin { First ASCII chars } if (AInStr[InCounter] <= 'z') and (AInStr[InCounter] >= 'a') then begin // Special turkish handling // small dotted i to capital dotted i if IsTurkish and (AInStr[InCounter] = 'i') then begin SetLength(Result,Length(Result)+1);// Increase the buffer OutStr := PChar(Result); OutStr[OutCounter]:=#$C4; OutStr[OutCounter+1]:=#$B0; inc(InCounter); inc(OutCounter,2); end else begin OutStr[OutCounter]:=chr(ord(AInStr[InCounter])-32); inc(InCounter); inc(OutCounter); end; end { Now everything else } else begin CharLen := UTF8CharacterLength(@AInStr[InCounter]); CharProcessed := False; NewCharLen := CharLen; if CharLen = 2 then begin OldChar := (Ord(AInStr[InCounter]) shl 8) or Ord(AInStr[InCounter+1]); NewChar := 0; // Major processing case OldChar of // Latin Characters 0000–0FFF http://en.wikibooks.org/wiki/Unicode/Character_reference/0000-0FFF $C39F: NewChar := $5353; // ß => SS $C3A0..$C3B6,$C3B8..$C3BE: NewChar := OldChar - $20; $C3BF: NewChar := $C5B8; // ÿ $C481..$C4B0: if OldChar mod 2 = 1 then NewChar := OldChar - 1; // 0130 = C4 B0 // turkish small undotted i to capital undotted i $C4B1: begin OutStr[OutCounter]:='I'; NewCharLen := 1; CharProcessed := True; end; $C4B2..$C4B7: if OldChar mod 2 = 1 then NewChar := OldChar - 1; // $C4B8: ĸ without upper/lower $C4B9..$C4BF: if OldChar mod 2 = 0 then NewChar := OldChar - 1; $C580: NewChar := $C4BF; // border between bytes $C581..$C588: if OldChar mod 2 = 0 then NewChar := OldChar - 1; // $C589 ʼn => ? $C58A..$C5B7: if OldChar mod 2 = 1 then NewChar := OldChar - 1; // $C5B8: // Ÿ already uppercase $C5B9..$C5BE: if OldChar mod 2 = 0 then NewChar := OldChar - 1; $C5BF: // 017F begin OutStr[OutCounter]:='S'; NewCharLen := 1; CharProcessed := True; end; // 0180 = C6 80 -> A convoluted part $C680: NewChar := $C983; $C682..$C685: if OldChar mod 2 = 1 then NewChar := OldChar - 1; $C688: NewChar := $C687; $C68C: NewChar := $C68B; // 0190 = C6 90 -> A convoluted part $C692: NewChar := $C691; $C695: NewChar := $C7B6; $C699: NewChar := $C698; $C69A: NewChar := $C8BD; $C69E: NewChar := $C8A0; // 01A0 = C6 A0 -> A convoluted part $C6A0..$C6A5: if OldChar mod 2 = 1 then NewChar := OldChar - 1; $C6A8: NewChar := $C6A7; $C6AD: NewChar := $C6AC; // 01B0 = C6 B0 $C6B0: NewChar := $C6AF; $C6B3..$C6B6: if OldChar mod 2 = 0 then NewChar := OldChar - 1; $C6B9: NewChar := $C6B8; $C6BD: NewChar := $C6BC; $C6BF: NewChar := $C7B7; // 01C0 = C7 80 $C784..$C786: NewChar := $C784; $C787..$C789: NewChar := $C787; $C78A..$C78C: NewChar := $C78A; $C78E: NewChar := $C78D; // 01D0 = C7 90 $C790: NewChar := $C78F; $C791..$C79C: if OldChar mod 2 = 0 then NewChar := OldChar - 1; $C79D: NewChar := $C68E; $C79F: NewChar := $C79E; // 01E0 = C7 A0 $C7A0..$C7AF: if OldChar mod 2 = 1 then NewChar := OldChar - 1; // 01F0 = C7 B0 $C7B2..$C7B3: NewChar := $C7B1; $C7B5: NewChar := $C7B4; $C7B8..$C7BF: if OldChar mod 2 = 1 then NewChar := OldChar - 1; // 0200 = C8 80 // 0210 = C8 90 $C880..$C89F: if OldChar mod 2 = 1 then NewChar := OldChar - 1; // 0220 = C8 A0 // 0230 = C8 B0 $C8A2..$C8B3: if OldChar mod 2 = 1 then NewChar := OldChar - 1; $C8BC: NewChar := $C8BB; $C8BF: begin OutStr[OutCounter] := #$E2; OutStr[OutCounter+1]:= #$B1; OutStr[OutCounter+2]:= #$BE; NewCharLen := 3; CharProcessed := True; end; // 0240 = C9 80 $C980: begin OutStr[OutCounter] := #$E2; OutStr[OutCounter+1]:= #$B1; OutStr[OutCounter+2]:= #$BF; NewCharLen := 3; CharProcessed := True; end; $C982: NewChar := $C981; $C986..$C98F: if OldChar mod 2 = 1 then NewChar := OldChar - 1; // 0250 = C9 90 $C990: begin OutStr[OutCounter] := #$E2; OutStr[OutCounter+1]:= #$B1; OutStr[OutCounter+2]:= #$AF; NewCharLen := 3; CharProcessed := True; end; $C991: begin OutStr[OutCounter] := #$E2; OutStr[OutCounter+1]:= #$B1; OutStr[OutCounter+2]:= #$AD; NewCharLen := 3; CharProcessed := True; end; $C992: begin OutStr[OutCounter] := #$E2; OutStr[OutCounter+1]:= #$B1; OutStr[OutCounter+2]:= #$B0; NewCharLen := 3; CharProcessed := True; end; $C993: NewChar := $C681; $C994: NewChar := $C686; $C996: NewChar := $C689; $C997: NewChar := $C68A; $C999: NewChar := $C68F; $C99B: NewChar := $C690; // 0260 = C9 A0 $C9A0: NewChar := $C693; $C9A3: NewChar := $C694; $C9A5: begin OutStr[OutCounter] := #$EA; OutStr[OutCounter+1]:= #$9E; OutStr[OutCounter+2]:= #$8D; NewCharLen := 3; CharProcessed := True; end; $C9A8: NewChar := $C697; $C9A9: NewChar := $C696; $C9AB: begin OutStr[OutCounter] := #$E2; OutStr[OutCounter+1]:= #$B1; OutStr[OutCounter+2]:= #$A2; NewCharLen := 3; CharProcessed := True; end; $C9AF: NewChar := $C69C; // 0270 = C9 B0 $C9B1: begin OutStr[OutCounter] := #$E2; OutStr[OutCounter+1]:= #$B1; OutStr[OutCounter+2]:= #$AE; NewCharLen := 3; CharProcessed := True; end; $C9B2: NewChar := $C69D; $C9B5: NewChar := $C69F; $C9BD: begin OutStr[OutCounter] := #$E2; OutStr[OutCounter+1]:= #$B1; OutStr[OutCounter+2]:= #$A4; NewCharLen := 3; CharProcessed := True; end; // 0280 = CA 80 $CA80: NewChar := $C6A6; $CA83: NewChar := $C6A9; $CA88: NewChar := $C6AE; $CA89: NewChar := $C984; $CA8A: NewChar := $C6B1; $CA8B: NewChar := $C6B2; $CA8C: NewChar := $C985; // 0290 = CA 90 $CA92: NewChar := $C6B7; { 03A0 = CE A0 03AC;GREEK SMALL LETTER ALPHA WITH TONOS;Ll;0;L;03B1 0301;;;;N;GREEK SMALL LETTER ALPHA TONOS;;0386;;0386 03AD;GREEK SMALL LETTER EPSILON WITH TONOS;Ll;0;L;03B5 0301;;;;N;GREEK SMALL LETTER EPSILON TONOS;;0388;;0388 03AE;GREEK SMALL LETTER ETA WITH TONOS;Ll;0;L;03B7 0301;;;;N;GREEK SMALL LETTER ETA TONOS;;0389;;0389 03AF;GREEK SMALL LETTER IOTA WITH TONOS;Ll;0;L;03B9 0301;;;;N;GREEK SMALL LETTER IOTA TONOS;;038A;;038A } $CEAC: NewChar := $CE86; $CEAD: NewChar := $CE88; $CEAE: NewChar := $CE89; $CEAF: NewChar := $CE8A; { 03B0 = CE B0 03B0;GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS;Ll;0;L;03CB 0301;;;;N;GREEK SMALL LETTER UPSILON DIAERESIS TONOS;;;; 03B1;GREEK SMALL LETTER ALPHA;Ll;0;L;;;;;N;;;0391;;0391 ... 03BF;GREEK SMALL LETTER OMICRON;Ll;0;L;;;;;N;;;039F;;039F } $CEB1..$CEBF: NewChar := OldChar - $20; // Greek Characters { 03C0 = CF 80 03C0;GREEK SMALL LETTER PI;Ll;0;L;;;;;N;;;03A0;;03A0 CF 80 => CE A0 03C1;GREEK SMALL LETTER RHO;Ll;0;L;;;;;N;;;03A1;;03A1 03C2;GREEK SMALL LETTER FINAL SIGMA;Ll;0;L;;;;;N;;;03A3;;03A3 03C3;GREEK SMALL LETTER SIGMA;Ll;0;L;;;;;N;;;03A3;;03A3 03C4;GREEK SMALL LETTER TAU;Ll;0;L;;;;;N;;;03A4;;03A4 .... 03CB;GREEK SMALL LETTER UPSILON WITH DIALYTIKA;Ll;0;L;03C5 0308;;;;N;GREEK SMALL LETTER UPSILON DIAERESIS;;03AB;;03AB 03CC;GREEK SMALL LETTER OMICRON WITH TONOS;Ll;0;L;03BF 0301;;;;N;GREEK SMALL LETTER OMICRON TONOS;;038C;;038C 03CD;GREEK SMALL LETTER UPSILON WITH TONOS;Ll;0;L;03C5 0301;;;;N;GREEK SMALL LETTER UPSILON TONOS;;038E;;038E 03CE;GREEK SMALL LETTER OMEGA WITH TONOS;Ll;0;L;03C9 0301;;;;N;GREEK SMALL LETTER OMEGA TONOS;;038F;;038F 03CF;GREEK CAPITAL KAI SYMBOL;Lu;0;L;;;;;N;;;;03D7; } $CF80,$CF81,$CF83..$CF8B: NewChar := OldChar - $E0; // Greek Characters $CF82: NewChar := $CEA3; $CF8C: NewChar := $CE8C; $CF8D: NewChar := $CE8E; $CF8E: NewChar := $CE8F; { 03D0 = CF 90 03D0;GREEK BETA SYMBOL;Ll;0;L; 03B2;;;;N;GREEK SMALL LETTER CURLED BETA;;0392;;0392 CF 90 => CE 92 03D1;GREEK THETA SYMBOL;Ll;0;L; 03B8;;;;N;GREEK SMALL LETTER SCRIPT THETA;;0398;;0398 => CE 98 03D5;GREEK PHI SYMBOL;Ll;0;L; 03C6;;;;N;GREEK SMALL LETTER SCRIPT PHI;;03A6;;03A6 => CE A6 03D6;GREEK PI SYMBOL;Ll;0;L; 03C0;;;;N;GREEK SMALL LETTER OMEGA PI;;03A0;;03A0 => CE A0 03D7;GREEK KAI SYMBOL;Ll;0;L;;;;;N;;;03CF;;03CF => CF 8F 03D9;GREEK SMALL LETTER ARCHAIC KOPPA;Ll;0;L;;;;;N;;;03D8;;03D8 03DB;GREEK SMALL LETTER STIGMA;Ll;0;L;;;;;N;;;03DA;;03DA 03DD;GREEK SMALL LETTER DIGAMMA;Ll;0;L;;;;;N;;;03DC;;03DC 03DF;GREEK SMALL LETTER KOPPA;Ll;0;L;;;;;N;;;03DE;;03DE } $CF90: NewChar := $CE92; $CF91: NewChar := $CE98; $CF95: NewChar := $CEA6; $CF96: NewChar := $CEA0; $CF97: NewChar := $CF8F; $CF99..$CF9F: if OldChar mod 2 = 1 then NewChar := OldChar - 1; // 03E0 = CF A0 $CFA0..$CFAF: if OldChar mod 2 = 1 then NewChar := OldChar - 1; { 03F0 = CF B0 03F0;GREEK KAPPA SYMBOL;Ll;0;L; 03BA;;;;N;GREEK SMALL LETTER SCRIPT KAPPA;;039A;;039A => CE 9A 03F1;GREEK RHO SYMBOL;Ll;0;L; 03C1;;;;N;GREEK SMALL LETTER TAILED RHO;;03A1;;03A1 => CE A1 03F2;GREEK LUNATE SIGMA SYMBOL;Ll;0;L; 03C2;;;;N;GREEK SMALL LETTER LUNATE SIGMA;;03F9;;03F9 03F5;GREEK LUNATE EPSILON SYMBOL;Ll;0;L; 03B5;;;;N;;;0395;;0395 => CE 95 03F8;GREEK SMALL LETTER SHO;Ll;0;L;;;;;N;;;03F7;;03F7 03FB;GREEK SMALL LETTER SAN;Ll;0;L;;;;;N;;;03FA;;03FA } $CFB0: NewChar := $CE9A; $CFB1: NewChar := $CEA1; $CFB2: NewChar := $CFB9; $CFB5: NewChar := $CE95; $CFB8: NewChar := $CFB7; $CFBB: NewChar := $CFBA; // 0400 = D0 80 ... 042F everything already uppercase // 0430 = D0 B0 $D0B0..$D0BF: NewChar := OldChar - $20; // Cyrillic alphabet // 0440 = D1 80 $D180..$D18F: NewChar := OldChar - $E0; // Cyrillic alphabet // 0450 = D1 90 $D190..$D19F: NewChar := OldChar - $110; // Cyrillic alphabet end; if NewChar <> 0 then begin OutStr[OutCounter] := Chr(Hi(NewChar)); OutStr[OutCounter+1]:= Chr(Lo(NewChar)); CharProcessed := True; end; end; // Copy the character if the string was disaligned by previous changed // and no processing was done in this character if (InCounter <> OutCounter+1) and (not CharProcessed) then begin for i := 0 to CharLen-1 do OutStr[OutCounter+i] :=AInStr[InCounter+i]; end; inc(InCounter, CharLen); inc(OutCounter, NewCharLen); end; end; // while // Final correction of the buffer size SetLength(Result,OutCounter); end; function FindInvalidUTF8Character(p: PChar; Count: PtrInt; StopOnNonASCII: Boolean): PtrInt; // return -1 if ok var CharLen: Integer; begin if (p<>nil) then begin Result:=0; while ResultCount then begin dec(Result,CharLen); exit; // missing following bytes end; end; end; // ok Result:=-1; end; function ValidUTF8String(const s: String): String; var p, cur: PChar; l, lr: integer; NeedFree: Boolean; begin if FindInvalidUTF8Character(PChar(s), Length(s)) <> -1 then begin NeedFree := True; GetMem(p, Length(s) + 1); StrPCopy(p, s); UTF8FixBroken(p); end else begin p := PChar(s); NeedFree := False; end; Result := ''; cur := p; while cur^ <> #0 do begin l := UTF8CharacterLength(cur); if (l = 1) and (cur^ < #32) then Result := Result + '#' + IntToStr(Ord(cur^)) else begin lr := Length(Result); SetLength(Result, lr + l); System.Move(cur^, Result[lr + 1], l); end; inc(cur, l) end; if NeedFree then FreeMem(p); end; function UTF8Trim(const s: string; Flags: TUTF8TrimFlags): string; var p: PChar; u: Cardinal; StartP: PtrUInt; l: Integer; KeepAllNonASCII: boolean; begin Result:=s; if Result='' then exit; KeepAllNonASCII:=[u8tKeepControlCodes,u8tKeepNoBreakSpaces]*Flags=[u8tKeepControlCodes,u8tKeepNoBreakSpaces]; if not (u8tKeepStart in Flags) then begin // trim start p:=PChar(Result); repeat l:=1; case p^ of #0: if p-PChar(Result)=length(Result) then begin // everything was trimmed exit('') end else if u8tKeepControlCodes in Flags then break; ' ': ; #10,#13: if u8tKeepLineBreaks in Flags then break; #9: if u8tKeepTabs in Flags then break; #1..#8,#11,#12,#14..#31,#127: if u8tKeepControlCodes in Flags then break; #128..#255: begin if KeepAllNonASCII then break; u:=UTF8CharacterToUnicode(p,l); if (l<=1) then break; // invalid character case u of 128..159, // C1 set of control codes 8206, 8207: // left-to-right, right-to-left mark if u8tKeepControlCodes in Flags then break; 160, // no break space $2007, // figure space $2026, // narrow no-break space $FEFF: // zero with no-break space if u8tKeepNoBreakSpaces in Flags then break; else break; end; end; else break; end; inc(p,l); until false; if p>PChar(Result) then begin Result:=copy(Result,p-PChar(Result)+1,length(Result)); if Result='' then exit; end; end; if not (u8tKeepEnd in Flags) then begin // trim end p:=@Result[length(Result)]; repeat case p^ of #0: if u8tKeepControlCodes in Flags then break; ' ': ; #10,#13: if u8tKeepLineBreaks in Flags then break; #9: if u8tKeepTabs in Flags then break; #1..#8,#11,#12,#14..#31,#127: if u8tKeepControlCodes in Flags then break; #128..#255: begin if KeepAllNonASCII then break; StartP:=UTF8FindNearestCharStart(PChar(Result),length(Result),p-PChar(Result)); u:=UTF8CharacterToUnicode(PChar(Result)+StartP,l); if (l<=1) then break; // invalid character case u of 128..159, // C1 set of control codes 8206, 8207: // left-to-right, right-to-left mark if u8tKeepControlCodes in Flags then break; 160, // no break space $2007, // figure space $2026, // narrow no-break space $FEFF: // zero with no-break space if u8tKeepNoBreakSpaces in Flags then break; else break; end; p:=PChar(Result)+StartP; end; else break; end; dec(p); until p 0 if S2 > S1. Compare 2 UTF8 encoded strings, case sensitive. Note: Use this function instead of AnsiCompareStr. This function guarantees proper collation on all supported platforms. ------------------------------------------------------------------------------} function UTF8CompareStr(const S1, S2: string): Integer; begin Result := SysUtils.CompareStr(S1, S2); end; {------------------------------------------------------------------------------ Name: UTF8CompareText Params: S1, S2 - UTF8 encoded strings Returns: < 0 if S1 < S2, 0 if S1 = S2, > 0 if S2 > S1. Compare 2 UTF8 encoded strings, case insensitive. Note: Use this function instead of AnsiCompareText. This function guarantees proper collation on all supported platforms. ------------------------------------------------------------------------------} function UTF8CompareText(const S1, S2: string): Integer; var S1Lower, S2Lower: string; begin S1Lower := UTF8LowerCase(S1); S2Lower := UTF8LowerCase(S2); Result := UTF8CompareStr(S1Lower, S2Lower); end; function CompareStrListUTF8LowerCase(List: TStringList; Index1, Index2: Integer ): Integer; begin Result:=UTF8CompareText(List[Index1],List[Index2]); end; {------------------------------------------------------------------------------ Name: ConvertUTF8ToUTF16 Params: Dest - Pointer to destination string DestWideCharCount - Wide char count allocated in destination string Src - Pointer to source string SrcCharCount - Char count allocated in source string Options - Conversion options, if none is set, both invalid and unfinished source chars are skipped toInvalidCharError - Stop on invalid source char and report error toInvalidCharToSymbol - Replace invalid source chars with '?' toUnfinishedCharError - Stop on unfinished source char and report error toUnfinishedCharToSymbol - Replace unfinished source char with '?' ActualWideCharCount - Actual wide char count converted from source string to destination string Returns: trNoError - The string was successfully converted without any error trNullSrc - Pointer to source string is nil trNullDest - Pointer to destination string is nil trDestExhausted - Destination buffer size is not big enough to hold converted string trInvalidChar - Invalid source char has occured trUnfinishedChar - Unfinished source char has occured Converts the specified UTF-8 encoded string to UTF-16 encoded (system endian) ------------------------------------------------------------------------------} function ConvertUTF8ToUTF16(Dest: PWideChar; DestWideCharCount: SizeUInt; Src: PChar; SrcCharCount: SizeUInt; Options: TConvertOptions; out ActualWideCharCount: SizeUInt): TConvertResult; var DestI, SrcI: SizeUInt; B1, B2, B3, B4: Byte; W: Word; C: Cardinal; function UnfinishedCharError: Boolean; begin if toUnfinishedCharToSymbol in Options then begin Dest[DestI] := System.WideChar('?'); Inc(DestI); Result := False; end else if toUnfinishedCharError in Options then begin ConvertUTF8ToUTF16 := trUnfinishedChar; Result := True; end else Result := False; end; function InvalidCharError(Count: SizeUInt): Boolean; inline; begin if not (toInvalidCharError in Options) then begin if toInvalidCharToSymbol in Options then begin Dest[DestI] := System.WideChar('?'); Inc(DestI); end; Dec(SrcI, Count); // skip trailing UTF-8 char bytes while (Count > 0) do begin if (Byte(Src[SrcI]) and %11000000) <> %10000000 then Break; Inc(SrcI); Dec(Count); end; Result := False; end else if toInvalidCharError in Options then begin ConvertUTF8ToUTF16 := trUnfinishedChar; Result := True; end; end; begin ActualWideCharCount := 0; if not Assigned(Src) then begin Result := trNullSrc; Exit; end; if not Assigned(Dest) then begin Result := trNullDest; Exit; end; SrcI := 0; DestI := 0; while (DestI < DestWideCharCount) and (SrcI < SrcCharCount) do begin B1 := Byte(Src[SrcI]); Inc(SrcI); if B1 < 128 then // single byte UTF-8 char begin Dest[DestI] := System.WideChar(B1); Inc(DestI); end else begin if SrcI >= SrcCharCount then if UnfinishedCharError then Exit(trInvalidChar) else Break; B2 := Byte(Src[SrcI]); Inc(SrcI); if (B1 and %11100000) = %11000000 then // double byte UTF-8 char begin if (B2 and %11000000) = %10000000 then begin Dest[DestI] := System.WideChar(((B1 and %00011111) shl 6) or (B2 and %00111111)); Inc(DestI); end else // invalid character, assume single byte UTF-8 char if InvalidCharError(1) then Exit(trInvalidChar); end else begin if SrcI >= SrcCharCount then if UnfinishedCharError then Exit(trInvalidChar) else Break; B3 := Byte(Src[SrcI]); Inc(SrcI); if (B1 and %11110000) = %11100000 then // triple byte UTF-8 char begin if ((B2 and %11000000) = %10000000) and ((B3 and %11000000) = %10000000) then begin W := ((B1 and %00011111) shl 12) or ((B2 and %00111111) shl 6) or (B3 and %00111111); if (W < $D800) or (W > $DFFF) then // to single wide char UTF-16 char begin Dest[DestI] := System.WideChar(W); Inc(DestI); end else // invalid UTF-16 character, assume double byte UTF-8 char if InvalidCharError(2) then Exit(trInvalidChar); end else // invalid character, assume double byte UTF-8 char if InvalidCharError(2) then Exit(trInvalidChar); end else begin if SrcI >= SrcCharCount then if UnfinishedCharError then Exit(trInvalidChar) else Break; B4 := Byte(Src[SrcI]); Inc(SrcI); if ((B1 and %11111000) = %11110000) and ((B2 and %11000000) = %10000000) and ((B3 and %11000000) = %10000000) and ((B4 and %11000000) = %10000000) then begin // 4 byte UTF-8 char C := ((B1 and %00011111) shl 18) or ((B2 and %00111111) shl 12) or ((B3 and %00111111) shl 6) or (B4 and %00111111); // to double wide char UTF-16 char Dest[DestI] := System.WideChar($D800 or ((C - $10000) shr 10)); Inc(DestI); if DestI >= DestWideCharCount then Break; Dest[DestI] := System.WideChar($DC00 or ((C - $10000) and %0000001111111111)); Inc(DestI); end else // invalid character, assume triple byte UTF-8 char if InvalidCharError(3) then Exit(trInvalidChar); end; end; end; end; if DestI >= DestWideCharCount then begin DestI := DestWideCharCount - 1; Result := trDestExhausted; end else Result := trNoError; Dest[DestI] := #0; ActualWideCharCount := DestI + 1; end; {------------------------------------------------------------------------------ Name: ConvertUTF16ToUTF8 Params: Dest - Pointer to destination string DestCharCount - Char count allocated in destination string Src - Pointer to source string SrcWideCharCount - Wide char count allocated in source string Options - Conversion options, if none is set, both invalid and unfinished source chars are skipped. See ConvertUTF8ToUTF16 for details. ActualCharCount - Actual char count converted from source string to destination string Returns: See ConvertUTF8ToUTF16 Converts the specified UTF-16 encoded string (system endian) to UTF-8 encoded ------------------------------------------------------------------------------} function ConvertUTF16ToUTF8(Dest: PChar; DestCharCount: SizeUInt; Src: PWideChar; SrcWideCharCount: SizeUInt; Options: TConvertOptions; out ActualCharCount: SizeUInt): TConvertResult; var DestI, SrcI: SizeUInt; W1, W2: Word; C: Cardinal; function UnfinishedCharError: Boolean; begin if toUnfinishedCharToSymbol in Options then begin Dest[DestI] := Char('?'); Inc(DestI); Result := False; end else if toUnfinishedCharError in Options then begin ConvertUTF16ToUTF8 := trUnfinishedChar; Result := True; end else Result := False; end; function InvalidCharError(Count: SizeUInt): Boolean; inline; begin if not (toInvalidCharError in Options) then begin if toInvalidCharToSymbol in Options then begin Dest[DestI] := Char('?'); Inc(DestI); end; Dec(SrcI, Count); // skip trailing UTF-16 wide char if (Word(Src[SrcI]) and $FC00) = $DC00 then Inc(SrcI); Result := False; end else if toInvalidCharError in Options then begin ConvertUTF16ToUTF8 := trUnfinishedChar; Result := True; end; end; begin ActualCharCount := 0; if not Assigned(Src) then begin Result := trNullSrc; Exit; end; if not Assigned(Dest) then begin Result := trNullDest; Exit; end; SrcI := 0; DestI := 0; while (DestI < DestCharCount) and (SrcI < SrcWideCharCount) do begin W1 := Word(Src[SrcI]); Inc(SrcI); if (W1 < $D800) or (W1 > $DFFF) then // single wide char UTF-16 char begin if W1 < $0080 then // to single byte UTF-8 char begin Dest[DestI] := Char(W1); Inc(DestI); end else if W1 < $0800 then // to double byte UTF-8 char begin Dest[DestI] := Char(%11000000 or ((W1 and %11111000000) shr 6)); Inc(DestI); if DestI >= DestCharCount then Break; Dest[DestI] := Char(%10000000 or (W1 and %111111)); Inc(DestI); end else begin // to triple byte UTF-8 char Dest[DestI] := Char(%11100000 or ((W1 and %1111000000000000) shr 12)); Inc(DestI); if DestI >= DestCharCount then Break; Dest[DestI] := Char(%10000000 or ((W1 and %111111000000) shr 6)); Inc(DestI); if DestI >= DestCharCount then Break; Dest[DestI] := Char(%10000000 or (W1 and %111111)); Inc(DestI); end; end else begin if SrcI >= SrcWideCharCount then if UnfinishedCharError then Exit(trInvalidChar) else Break; W2 := Word(Src[SrcI]); Inc(SrcI); if (W1 and $F800) = $D800 then // double wide char UTF-16 char begin if (W2 and $FC00) = $DC00 then begin C := (W1 - $D800) shl 10 + (W2 - $DC00) + $10000; // to 4 byte UTF-8 char Dest[DestI] := Char(%11110000 or (C shr 18)); Inc(DestI); if DestI >= DestCharCount then Break; Dest[DestI] := Char(%10000000 or ((C and $3F000) shr 12)); Inc(DestI); if DestI >= DestCharCount then Break; Dest[DestI] := Char(%10000000 or ((C and %111111000000) shr 6)); Inc(DestI); if DestI >= DestCharCount then Break; Dest[DestI] := Char(%10000000 or (C and %111111)); Inc(DestI); end else // invalid character, assume single wide char UTF-16 char if InvalidCharError(1) then Exit(trInvalidChar); end else // invalid character, assume single wide char UTF-16 char if InvalidCharError(1) then Exit(trInvalidChar); end; end; if DestI >= DestCharCount then begin DestI := DestCharCount - 1; Result := trDestExhausted; end else Result := trNoError; Dest[DestI] := #0; ActualCharCount := DestI + 1; end; {------------------------------------------------------------------------------ Name: UTF8ToUTF16 Params: S - Source UTF-8 string Returns: UTF-16 encoded string Converts the specified UTF-8 encoded string to UTF-16 encoded (system endian) Avoid copying the result string since on windows a widestring requires a full copy ------------------------------------------------------------------------------} function UTF8ToUTF16(const S: AnsiString): UnicodeString; var L: SizeUInt; begin if S = '' then begin Result := ''; Exit; end; SetLength(Result, Length(S)); // wide chars of UTF-16 <= bytes of UTF-8 string if ConvertUTF8ToUTF16(PWideChar(Result), Length(Result) + 1, PChar(S), Length(S), [toInvalidCharToSymbol], L) = trNoError then SetLength(Result, L - 1) else Result := ''; end; {------------------------------------------------------------------------------ Name: UTF16ToUTF8 Params: S - Source UTF-16 string (system endian) Returns: UTF-8 encoded string Converts the specified UTF-16 encoded string (system endian) to UTF-8 encoded ------------------------------------------------------------------------------} function UTF16ToUTF8(const S: UnicodeString): AnsiString; var L: SizeUInt; R: AnsiString; begin Result := ''; if S = '' then Exit; SetLength(R, Length(S) * 3); // bytes of UTF-8 <= 3 * wide chars of UTF-16 string // e.g. %11100000 10100000 10000000 (UTF-8) is $0800 (UTF-16) if ConvertUTF16ToUTF8(PChar(R), Length(R) + 1, PWideChar(S), Length(S), [toInvalidCharToSymbol], L) = trNoError then begin SetLength(R, L - 1); Result := R; end; end; procedure LazGetLanguageIDs(var Lang, FallbackLang: String); {$IFDEF DARWIN} function GetLanguage: boolean; var Ref: CFStringRef; LangArray: CFMutableArrayRef; StrSize: CFIndex; StrRange: CFRange; Locals: CFArrayRef; Bundle: CFBundleRef; begin Result := false; Bundle:=CFBundleGetMainBundle; if Bundle=nil then exit; Locals:=CFBundleCopyBundleLocalizations(Bundle); if Locals=nil then exit; LangArray := CFBundleCopyLocalizationsForPreferences(Locals, nil); try if CFArrayGetCount(LangArray) > 0 then begin Ref := CFArrayGetValueAtIndex(LangArray, 0); StrRange.location := 0; StrRange.length := CFStringGetLength(Ref); CFStringGetBytes(Ref, StrRange, kCFStringEncodingUTF8, Ord('?'), False, nil, 0, StrSize); SetLength(Lang, StrSize); if StrSize > 0 then begin CFStringGetBytes(Ref, StrRange, kCFStringEncodingUTF8, Ord('?'), False, @Lang[1], StrSize, StrSize); Result:=true; FallbackLang := Copy(Lang, 1, 2); end; end; finally CFRelease(LangArray); CFRelease(Locals); end; end; {$ENDIF} begin {$IFDEF DARWIN} if not GetLanguage then GetLanguageIDs(Lang, FallbackLang); {$ELSE} GetLanguageIDs(Lang, FallbackLang); {$ENDIF} end; { This routine will strip country information from the language ID making it more simple Ideally the resulting ID from here should conform to ISO 639-1 or ISO 639-2, if the language has no code in ISO 639-1 } procedure LazGetShortLanguageID(var Lang: String); var FallbackLang: String; begin FallbackLang:=''; LazGetLanguageIDs(Lang, FallbackLang); // Simply making sure its length is at most 2 should be enough for most languages if Length(Lang) > 2 then Lang := Lang[1] + Lang[2]; end; procedure InternalInit; var c: Char; begin for c:=Low(char) to High(char) do begin FPUpChars[c]:=upcase(c); end; end; initialization InternalInit; end.