mirror of
				https://gitlab.com/freepascal.org/lazarus/lazarus.git
				synced 2025-10-31 06:41:31 +01:00 
			
		
		
		
	
		
			
				
	
	
		
			3136 lines
		
	
	
		
			101 KiB
		
	
	
	
		
			ObjectPascal
		
	
	
	
	
	
			
		
		
	
	
			3136 lines
		
	
	
		
			101 KiB
		
	
	
	
		
			ObjectPascal
		
	
	
	
	
	
| {
 | ||
|  /***************************************************************************
 | ||
|                                   lazutf8.pas
 | ||
|  ***************************************************************************/
 | ||
| 
 | ||
|  *****************************************************************************
 | ||
|   This file is part of LazUtils
 | ||
| 
 | ||
|   See the file COPYING.modifiedLGPL.txt, included in this distribution,
 | ||
|   for details about the license.
 | ||
|  *****************************************************************************
 | ||
| 
 | ||
|   Useful routines for managing UTF-8 strings
 | ||
| 
 | ||
|   - all functions are thread safe unless explicitely stated
 | ||
| }
 | ||
| unit LazUTF8;
 | ||
| 
 | ||
| {$mode objfpc}{$H+}
 | ||
| 
 | ||
| interface
 | ||
| 
 | ||
| uses
 | ||
| {$ifdef windows}
 | ||
|   Windows,
 | ||
| {$endif}
 | ||
|   Classes, SysUtils; 
 | ||
| 
 | ||
| // AnsiToUTF8 and UTF8ToAnsi need a widestring manager under Linux, BSD, MacOSX
 | ||
| // but normally these OS use UTF-8 as system encoding so the widestringmanager
 | ||
| // is not needed.
 | ||
| function NeedRTLAnsi: boolean;// true if system encoding is not UTF-8
 | ||
| procedure SetNeedRTLAnsi(NewValue: boolean);
 | ||
| function UTF8ToSys(const s: string): string;// as UTF8ToAnsi but more independent of widestringmanager
 | ||
| function SysToUTF8(const s: string): string;// as AnsiToUTF8 but more independent of widestringmanager
 | ||
| function ConsoleToUTF8(const s: string): string;// converts OEM encoded string to UTF8 (used with some Windows specific functions)
 | ||
| function UTF8ToConsole(const s: string): string;// converts UTF8 string to console encoding (used by Write, WriteLn)
 | ||
| 
 | ||
| function ParamStrUTF8(Param: Integer): string;
 | ||
| 
 | ||
| function GetEnvironmentStringUTF8(Index: Integer): string;
 | ||
| function GetEnvironmentVariableUTF8(const EnvVar: string): String;
 | ||
| 
 | ||
| function SysErrorMessageUTF8(ErrorCode: Integer): String;
 | ||
| 
 | ||
| 
 | ||
| function UTF8CharacterLength(p: PChar): integer;
 | ||
| function UTF8Length(const s: string): PtrInt;
 | ||
| function UTF8Length(p: PChar; ByteCount: PtrInt): PtrInt;
 | ||
| function UTF8CharacterToUnicode(p: PChar; out CharLen: integer): Cardinal;
 | ||
| function UnicodeToUTF8(CodePoint: cardinal): string;
 | ||
| function UnicodeToUTF8(CodePoint: cardinal; Buf: PChar): integer;
 | ||
| function UnicodeToUTF8SkipErrors(CodePoint: cardinal; Buf: PChar): integer;
 | ||
| function UnicodeToUTF8Inline(CodePoint: cardinal; Buf: PChar): integer; inline;
 | ||
| function UTF8ToDoubleByteString(const s: string): string;
 | ||
| function UTF8ToDoubleByte(UTF8Str: PChar; Len: PtrInt; DBStr: PByte): PtrInt;
 | ||
| function UTF8FindNearestCharStart(UTF8Str: PChar; Len: integer;
 | ||
|                                   BytePos: integer): integer;
 | ||
| // find the n-th UTF8 character, ignoring BIDI
 | ||
| function UTF8CharStart(UTF8Str: PChar; Len, CharIndex: PtrInt): PChar;
 | ||
| // find the byte index of the n-th UTF8 character, ignoring BIDI (byte len of substr)
 | ||
| function UTF8CharToByteIndex(UTF8Str: PChar; Len, CharIndex: PtrInt): PtrInt;
 | ||
| procedure UTF8FixBroken(P: PChar); overload;
 | ||
| procedure UTF8FixBroken(var S: string); overload;
 | ||
| function UTF8CharacterStrictLength(P: PChar): integer;
 | ||
| function UTF8CStringToUTF8String(SourceStart: PChar; SourceLen: PtrInt) : string;
 | ||
| function UTF8Pos(const SearchForText, SearchInText: string; StartPos: SizeInt = 1): PtrInt;
 | ||
| function UTF8PosP(SearchForText: PChar; SearchForTextLen: SizeInt;
 | ||
|   SearchInText: PChar; SearchInTextLen: SizeInt): PChar;
 | ||
| function UTF8Copy(const s: string; StartCharIndex, CharCount: PtrInt): string;
 | ||
| procedure UTF8Delete(var s: String; StartCharIndex, CharCount: PtrInt);
 | ||
| procedure UTF8Insert(const source: String; var s: string; StartCharIndex: PtrInt);
 | ||
| 
 | ||
| function UTF8LowerCase(const AInStr: string; ALanguage: string=''): string;
 | ||
| function UTF8UpperCase(const AInStr: string; ALanguage: string=''): string;
 | ||
| function FindInvalidUTF8Character(p: PChar; Count: PtrInt;
 | ||
|                                   StopOnNonASCII: Boolean = false): PtrInt;
 | ||
| function ValidUTF8String(const s: String): String;
 | ||
| 
 | ||
| type
 | ||
|   TUTF8TrimFlag = (
 | ||
|     u8tKeepStart,
 | ||
|     u8tKeepEnd,
 | ||
|     u8tKeepTabs,
 | ||
|     u8tKeepLineBreaks,
 | ||
|     u8tKeepNoBreakSpaces,
 | ||
|     u8tKeepControlCodes // excluding tabs and line breaks
 | ||
|     );
 | ||
|   TUTF8TrimFlags = set of TUTF8TrimFlag;
 | ||
| function UTF8Trim(const s: string; Flags: TUTF8TrimFlags = []): string;
 | ||
| 
 | ||
| procedure AssignUTF8ListToAnsi(UTF8List, AnsiList: TStrings);
 | ||
| 
 | ||
| //compare functions
 | ||
| 
 | ||
| function UTF8CompareStr(const S1, S2: string): Integer; inline;
 | ||
| function UTF8CompareStrP(S1, S2: PChar): Integer;
 | ||
| function UTF8CompareStr(S1: PChar; Count1: SizeInt; S2: PChar; Count2: SizeInt): Integer;
 | ||
| function UTF8CompareText(const S1, S2: string): Integer;
 | ||
| function UTF8CompareStrCollated(const S1, S2: string): Integer;
 | ||
| function CompareStrListUTF8LowerCase(List: TStringList; Index1, Index2: Integer): Integer;
 | ||
| 
 | ||
| type
 | ||
|   TConvertResult = (trNoError, trNullSrc, trNullDest, trDestExhausted,
 | ||
|     trInvalidChar, trUnfinishedChar);
 | ||
| 
 | ||
|   TConvertOption = (toInvalidCharError, toInvalidCharToSymbol,
 | ||
|     toUnfinishedCharError, toUnfinishedCharToSymbol);
 | ||
|   TConvertOptions = set of TConvertOption;
 | ||
| 
 | ||
| function ConvertUTF8ToUTF16(Dest: PWideChar; DestWideCharCount: SizeUInt;
 | ||
|   Src: PChar; SrcCharCount: SizeUInt; Options: TConvertOptions;
 | ||
|   out ActualWideCharCount: SizeUInt): TConvertResult;
 | ||
| 
 | ||
| function ConvertUTF16ToUTF8(Dest: PChar; DestCharCount: SizeUInt;
 | ||
|   Src: PWideChar; SrcWideCharCount: SizeUInt; Options: TConvertOptions;
 | ||
|   out ActualCharCount: SizeUInt): TConvertResult;
 | ||
| 
 | ||
| function UTF8ToUTF16(const S: AnsiString): UnicodeString;
 | ||
| function UTF16ToUTF8(const S: UnicodeString): AnsiString;
 | ||
| 
 | ||
| // locale
 | ||
| procedure LazGetLanguageIDs(var Lang, FallbackLang: String);
 | ||
| procedure LazGetShortLanguageID(var Lang: String);
 | ||
| 
 | ||
| var
 | ||
|   FPUpChars: array[char] of char;
 | ||
| 
 | ||
| implementation
 | ||
| 
 | ||
| uses
 | ||
|   gettext
 | ||
| {$IFDEF Darwin}, MacOSAll{$ENDIF}
 | ||
|   ;
 | ||
| 
 | ||
| {$ifdef windows}
 | ||
|   {$i winlazutf8.inc}
 | ||
| {$else}
 | ||
|   {$i unixlazutf8.inc}
 | ||
| {$endif}
 | ||
| 
 | ||
| var
 | ||
|   FNeedRTLAnsi: boolean = false;
 | ||
|   FNeedRTLAnsiValid: boolean = false;
 | ||
| 
 | ||
| function NeedRTLAnsi: boolean;
 | ||
| {$IFDEF WinCE}
 | ||
| // CP_UTF8 is missing in the windows unit of the Windows CE RTL
 | ||
| const
 | ||
|   CP_UTF8 = 65001;
 | ||
| {$ENDIF}
 | ||
| {$IFNDEF Windows}
 | ||
| var
 | ||
|   Lang: String;
 | ||
|   i: LongInt;
 | ||
|   Encoding: String;
 | ||
| {$ENDIF}
 | ||
| begin
 | ||
|   if FNeedRTLAnsiValid then
 | ||
|     exit(FNeedRTLAnsi);
 | ||
|   {$IFDEF Windows}
 | ||
|   FNeedRTLAnsi:=GetACP<>CP_UTF8;
 | ||
|   {$ELSE}
 | ||
|   FNeedRTLAnsi:=false;
 | ||
|   Lang := SysUtils.GetEnvironmentVariable('LC_ALL');
 | ||
|   if lang = '' then
 | ||
|   begin
 | ||
|     Lang := SysUtils.GetEnvironmentVariable('LC_MESSAGES');
 | ||
|     if Lang = '' then
 | ||
|     begin
 | ||
|       Lang := SysUtils.GetEnvironmentVariable('LANG');
 | ||
|     end;
 | ||
|   end;
 | ||
|   i:=System.Pos('.',Lang);
 | ||
|   if (i>0) then begin
 | ||
|     Encoding:=copy(Lang,i+1,length(Lang)-i);
 | ||
|     FNeedRTLAnsi:=(SysUtils.CompareText(Encoding,'UTF-8')<>0)
 | ||
|               and (SysUtils.CompareText(Encoding,'UTF8')<>0);
 | ||
|   end;
 | ||
|   {$ENDIF}
 | ||
|   FNeedRTLAnsiValid:=true;
 | ||
|   Result:=FNeedRTLAnsi;
 | ||
| end;
 | ||
| 
 | ||
| procedure SetNeedRTLAnsi(NewValue: boolean);
 | ||
| begin
 | ||
|   FNeedRTLAnsi:=NewValue;
 | ||
|   FNeedRTLAnsiValid:=true;
 | ||
| end;
 | ||
| 
 | ||
| function IsASCII(const s: string): boolean; inline;
 | ||
| var
 | ||
|   i: Integer;
 | ||
| begin
 | ||
|   for i:=1 to length(s) do if ord(s[i])>127 then exit(false);
 | ||
|   Result:=true;
 | ||
| end;
 | ||
| 
 | ||
| function UTF8ToSys(const s: string): string;
 | ||
| begin
 | ||
|   if NeedRTLAnsi and (not IsASCII(s)) then
 | ||
|     Result:=UTF8ToAnsi(s)
 | ||
|   else
 | ||
|     Result:=s;
 | ||
| end;
 | ||
| 
 | ||
| function SysToUTF8(const s: string): string;
 | ||
| begin
 | ||
|   if NeedRTLAnsi and (not IsASCII(s)) then
 | ||
|   begin
 | ||
|     Result:=AnsiToUTF8(s);
 | ||
|     {$ifdef FPC_HAS_CPSTRING}
 | ||
|     // prevent UTF8 codepage appear in the strings - we don't need codepage
 | ||
|     // conversion magic in LCL code
 | ||
|     SetCodePage(RawByteString(Result), StringCodePage(s), False);
 | ||
|     {$endif}
 | ||
|   end
 | ||
|   else
 | ||
|     Result:=s;
 | ||
| end;
 | ||
| 
 | ||
| 
 | ||
| function GetEnvironmentStringUTF8(Index: Integer): String;
 | ||
| begin
 | ||
|   // on Windows SysUtils.GetEnvironmentString returns OEM encoded string
 | ||
|   // so ConsoleToUTF8 function should be used!
 | ||
|   // RTL issue: http://bugs.freepascal.org/view.php?id=15233
 | ||
|   Result:=ConsoleToUTF8(SysUtils.GetEnvironmentString(Index));
 | ||
| end;
 | ||
| 
 | ||
| function GetEnvironmentVariableUTF8(const EnvVar: String): String;
 | ||
| begin
 | ||
|   // on Windows SysUtils.GetEnvironmentString returns OEM encoded string
 | ||
|   // so ConsoleToUTF8 function should be used!
 | ||
|   // RTL issue: http://bugs.freepascal.org/view.php?id=15233
 | ||
|   Result:=ConsoleToUTF8(SysUtils.GetEnvironmentVariable(UTF8ToSys(EnvVar)));
 | ||
| end;
 | ||
| 
 | ||
| function SysErrorMessageUTF8(ErrorCode: Integer): String;
 | ||
| begin
 | ||
|   Result := SysToUTF8(SysUtils.SysErrorMessage(ErrorCode));
 | ||
| end;
 | ||
| 
 | ||
| 
 | ||
| function UTF8CharacterLength(p: PChar): integer;
 | ||
| begin
 | ||
|   if p<>nil then begin
 | ||
|     if ord(p^)<%11000000 then begin
 | ||
|       // regular single byte character (#0 is a character, this is pascal ;)
 | ||
|       Result:=1;
 | ||
|     end
 | ||
|     else begin
 | ||
|       // multi byte
 | ||
|       if ((ord(p^) and %11100000) = %11000000) then begin
 | ||
|         // could be 2 byte character
 | ||
|         if (ord(p[1]) and %11000000) = %10000000 then
 | ||
|           Result:=2
 | ||
|         else
 | ||
|           Result:=1;
 | ||
|       end
 | ||
|       else if ((ord(p^) and %11110000) = %11100000) then begin
 | ||
|         // could be 3 byte character
 | ||
|         if ((ord(p[1]) and %11000000) = %10000000)
 | ||
|         and ((ord(p[2]) and %11000000) = %10000000) then
 | ||
|           Result:=3
 | ||
|         else
 | ||
|           Result:=1;
 | ||
|       end
 | ||
|       else if ((ord(p^) and %11111000) = %11110000) then begin
 | ||
|         // could be 4 byte character
 | ||
|         if ((ord(p[1]) and %11000000) = %10000000)
 | ||
|         and ((ord(p[2]) and %11000000) = %10000000)
 | ||
|         and ((ord(p[3]) and %11000000) = %10000000) then
 | ||
|           Result:=4
 | ||
|         else
 | ||
|           Result:=1;
 | ||
|       end
 | ||
|       else
 | ||
|         Result:=1;
 | ||
|     end;
 | ||
|   end else
 | ||
|     Result:=0;
 | ||
| end;
 | ||
| 
 | ||
| function UTF8Length(const s: string): PtrInt;
 | ||
| begin
 | ||
|   Result:=UTF8Length(PChar(s),length(s));
 | ||
| end;
 | ||
| 
 | ||
| function UTF8Length(p: PChar; ByteCount: PtrInt): PtrInt;
 | ||
| var
 | ||
|   CharLen: LongInt;
 | ||
| begin
 | ||
|   Result:=0;
 | ||
|   while (ByteCount>0) do begin
 | ||
|     inc(Result);
 | ||
|     CharLen:=UTF8CharacterLength(p);
 | ||
|     inc(p,CharLen);
 | ||
|     dec(ByteCount,CharLen);
 | ||
|   end;
 | ||
| end;
 | ||
| 
 | ||
| function UTF8CharacterToUnicode(p: PChar; out CharLen: integer): Cardinal;
 | ||
| { if p=nil then CharLen=0 otherwise CharLen>0
 | ||
|   If there is an encoding error the Result is undefined.
 | ||
|   Use UTF8FixBroken to fix UTF-8 encoding.
 | ||
|   It does not check if the codepoint is defined in the Unicode tables.
 | ||
| }
 | ||
| begin
 | ||
|   if p<>nil then begin
 | ||
|     if ord(p^)<%11000000 then begin
 | ||
|       // regular single byte character (#0 is a normal char, this is pascal ;)
 | ||
|       Result:=ord(p^);
 | ||
|       CharLen:=1;
 | ||
|     end
 | ||
|     else if ((ord(p^) and %11100000) = %11000000) then begin
 | ||
|       // starts with %110 => could be double byte character
 | ||
|       if (ord(p[1]) and %11000000) = %10000000 then begin
 | ||
|         CharLen:=2;
 | ||
|         Result:=((ord(p^) and %00011111) shl 6)
 | ||
|                 or (ord(p[1]) and %00111111);
 | ||
|         if Result<(1 shl 7) then begin
 | ||
|           // wrong encoded, could be an XSS attack
 | ||
|           Result:=0;
 | ||
|         end;
 | ||
|       end else begin
 | ||
|         Result:=ord(p^);
 | ||
|         CharLen:=1;
 | ||
|       end;
 | ||
|     end
 | ||
|     else if ((ord(p^) and %11110000) = %11100000) then begin
 | ||
|       // starts with %1110 => could be triple byte character
 | ||
|       if ((ord(p[1]) and %11000000) = %10000000)
 | ||
|       and ((ord(p[2]) and %11000000) = %10000000) then begin
 | ||
|         CharLen:=3;
 | ||
|         Result:=((ord(p^) and %00011111) shl 12)
 | ||
|                 or ((ord(p[1]) and %00111111) shl 6)
 | ||
|                 or (ord(p[2]) and %00111111);
 | ||
|         if Result<(1 shl 11) then begin
 | ||
|           // wrong encoded, could be an XSS attack
 | ||
|           Result:=0;
 | ||
|         end;
 | ||
|       end else begin
 | ||
|         Result:=ord(p^);
 | ||
|         CharLen:=1;
 | ||
|       end;
 | ||
|     end
 | ||
|     else if ((ord(p^) and %11111000) = %11110000) then begin
 | ||
|       // starts with %11110 => could be 4 byte character
 | ||
|       if ((ord(p[1]) and %11000000) = %10000000)
 | ||
|       and ((ord(p[2]) and %11000000) = %10000000)
 | ||
|       and ((ord(p[3]) and %11000000) = %10000000) then begin
 | ||
|         CharLen:=4;
 | ||
|         Result:=((ord(p^) and %00001111) shl 18)
 | ||
|                 or ((ord(p[1]) and %00111111) shl 12)
 | ||
|                 or ((ord(p[2]) and %00111111) shl 6)
 | ||
|                 or (ord(p[3]) and %00111111);
 | ||
|         if Result<(1 shl 16) then begin
 | ||
|           // wrong encoded, could be an XSS attack
 | ||
|           Result:=0;
 | ||
|         end;
 | ||
|       end else begin
 | ||
|         Result:=ord(p^);
 | ||
|         CharLen:=1;
 | ||
|       end;
 | ||
|     end
 | ||
|     else begin
 | ||
|       // invalid character
 | ||
|       Result:=ord(p^);
 | ||
|       CharLen:=1;
 | ||
|     end;
 | ||
|   end else begin
 | ||
|     Result:=0;
 | ||
|     CharLen:=0;
 | ||
|   end;
 | ||
| end;
 | ||
| 
 | ||
| function UnicodeToUTF8(CodePoint: cardinal; Buf: PChar): integer;
 | ||
| 
 | ||
|   procedure RaiseInvalidUnicode;
 | ||
|   begin
 | ||
|     raise Exception.Create('UnicodeToUTF8: invalid unicode: '+IntToStr(CodePoint));
 | ||
|   end;
 | ||
| 
 | ||
| begin
 | ||
|   Result:=UnicodeToUTF8Inline(CodePoint,Buf);
 | ||
|   if Result=0 then
 | ||
|     RaiseInvalidUnicode;
 | ||
| end;
 | ||
| 
 | ||
| function UnicodeToUTF8SkipErrors(CodePoint: cardinal; Buf: PChar): integer;
 | ||
| begin
 | ||
|   Result:=UnicodeToUTF8Inline(CodePoint,Buf);
 | ||
| end;
 | ||
| 
 | ||
| function UnicodeToUTF8(CodePoint: cardinal): string;
 | ||
| var
 | ||
|   Buf: array[0..6] of Char;
 | ||
|   Len: Integer;
 | ||
| begin
 | ||
|   Len:=UnicodeToUTF8Inline(CodePoint, @Buf[0]);
 | ||
|   Buf[Len]:=#0;
 | ||
|   Result := StrPas(@Buf[0]);
 | ||
| end;
 | ||
| 
 | ||
| function UnicodeToUTF8Inline(CodePoint: cardinal; Buf: PChar): integer;
 | ||
| begin
 | ||
|   case CodePoint of
 | ||
|     0..$7f:
 | ||
|       begin
 | ||
|         Result:=1;
 | ||
|         Buf[0]:=char(byte(CodePoint));
 | ||
|       end;
 | ||
|     $80..$7ff:
 | ||
|       begin
 | ||
|         Result:=2;
 | ||
|         Buf[0]:=char(byte($c0 or (CodePoint shr 6)));
 | ||
|         Buf[1]:=char(byte($80 or (CodePoint and $3f)));
 | ||
|       end;
 | ||
|     $800..$ffff:
 | ||
|       begin
 | ||
|         Result:=3;
 | ||
|         Buf[0]:=char(byte($e0 or (CodePoint shr 12)));
 | ||
|         Buf[1]:=char(byte((CodePoint shr 6) and $3f) or $80);
 | ||
|         Buf[2]:=char(byte(CodePoint and $3f) or $80);
 | ||
|       end;
 | ||
|     $10000..$10ffff:
 | ||
|       begin
 | ||
|         Result:=4;
 | ||
|         Buf[0]:=char(byte($f0 or (CodePoint shr 18)));
 | ||
|         Buf[1]:=char(byte((CodePoint shr 12) and $3f) or $80);
 | ||
|         Buf[2]:=char(byte((CodePoint shr 6) and $3f) or $80);
 | ||
|         Buf[3]:=char(byte(CodePoint and $3f) or $80);
 | ||
|       end;
 | ||
|   else
 | ||
|     Result:=0;
 | ||
|   end;
 | ||
| end;
 | ||
| 
 | ||
| function UTF8ToDoubleByteString(const s: string): string;
 | ||
| var
 | ||
|   Len: Integer;
 | ||
| begin
 | ||
|   Len:=UTF8Length(s);
 | ||
|   SetLength(Result,Len*2);
 | ||
|   if Len=0 then exit;
 | ||
|   UTF8ToDoubleByte(PChar(s),length(s),PByte(Result));
 | ||
| end;
 | ||
| 
 | ||
| { returns number of double bytes }
 | ||
| function UTF8ToDoubleByte(UTF8Str: PChar; Len: PtrInt; DBStr: PByte): PtrInt;
 | ||
| var
 | ||
|   SrcPos: PChar;
 | ||
|   CharLen: LongInt;
 | ||
|   DestPos: PByte;
 | ||
|   u: Cardinal;
 | ||
| begin
 | ||
|   SrcPos:=UTF8Str;
 | ||
|   DestPos:=DBStr;
 | ||
|   Result:=0;
 | ||
|   while Len>0 do begin
 | ||
|     u:=UTF8CharacterToUnicode(SrcPos,CharLen);
 | ||
|     DestPos^:=byte((u shr 8) and $ff);
 | ||
|     inc(DestPos);
 | ||
|     DestPos^:=byte(u and $ff);
 | ||
|     inc(DestPos);
 | ||
|     inc(SrcPos,CharLen);
 | ||
|     dec(Len,CharLen);
 | ||
|     inc(Result);
 | ||
|   end;
 | ||
| end;
 | ||
| 
 | ||
| { Find the start of the UTF8 character which contains BytePos,
 | ||
|   Len is length in byte, BytePos starts at 0 }
 | ||
| function UTF8FindNearestCharStart(UTF8Str: PChar; Len: integer;
 | ||
|   BytePos: integer): integer;
 | ||
| begin
 | ||
|   Result:=0;
 | ||
|   if (UTF8Str<>nil) and (Len>0) and (BytePos>=0) then begin
 | ||
|     Result:=BytePos;
 | ||
|     if Result>Len then Result:=Len-1;
 | ||
|     if (Result>0) and (ord(UTF8Str[Result]) and %11000000=%10000000) then begin
 | ||
|       dec(Result);
 | ||
|       if (Result>0) and (ord(UTF8Str[Result]) and %11000000=%10000000) then begin
 | ||
|         dec(Result);
 | ||
|         if (Result>0) and (ord(UTF8Str[Result]) and %11000000=%10000000) then begin
 | ||
|           dec(Result);
 | ||
|           // should be four byte character
 | ||
|           if (ord(UTF8Str[Result]) and %11111000<>%11110000) then begin
 | ||
|             // broken UTF8 character
 | ||
|             inc(Result,3);
 | ||
|           end else begin
 | ||
|             // is four byte character
 | ||
|           end;
 | ||
|         end else if (ord(UTF8Str[Result]) and %11110000<>%11100000) then begin
 | ||
|           // broken UTF8 character, should be three byte
 | ||
|           inc(Result,2);
 | ||
|         end else
 | ||
|         begin
 | ||
|           // is three byte character
 | ||
|         end;
 | ||
|       end else if (ord(UTF8Str[Result]) and %11100000<>%11000000) then begin
 | ||
|         // broken UTF8 character, should be two byte
 | ||
|         inc(Result);
 | ||
|       end else
 | ||
|       begin
 | ||
|         // is two byte character
 | ||
|       end;
 | ||
|     end;
 | ||
|   end;
 | ||
| end;
 | ||
| 
 | ||
| { Len is the length in bytes of UTF8Str
 | ||
|   CharIndex is the position of the desired char (starting at 0), in chars
 | ||
| }
 | ||
| function UTF8CharStart(UTF8Str: PChar; Len, CharIndex: PtrInt): PChar;
 | ||
| var
 | ||
|   CharLen: LongInt;
 | ||
| begin
 | ||
|   Result:=UTF8Str;
 | ||
|   if Result<>nil then begin
 | ||
|     while (CharIndex>0) and (Len>0) do begin
 | ||
|       CharLen:=UTF8CharacterLength(Result);
 | ||
|       dec(Len,CharLen);
 | ||
|       dec(CharIndex);
 | ||
|       inc(Result,CharLen);
 | ||
|     end;
 | ||
|     if (CharIndex<>0) or (Len<0) then
 | ||
|       Result:=nil;
 | ||
|   end;
 | ||
| end;
 | ||
| 
 | ||
| function UTF8CharToByteIndex(UTF8Str: PChar; Len, CharIndex: PtrInt): PtrInt;
 | ||
| var
 | ||
|   p: PChar;
 | ||
| begin
 | ||
|   p := UTF8CharStart(UTF8Str, Len, CharIndex);
 | ||
|   if p = nil
 | ||
|   then Result := -1
 | ||
|   else Result := p - UTF8Str;
 | ||
| end;
 | ||
| 
 | ||
| { fix any broken UTF8 sequences with spaces }
 | ||
| procedure UTF8FixBroken(P: PChar);
 | ||
| var
 | ||
|   c: cardinal;
 | ||
| begin
 | ||
|   if p=nil then exit;
 | ||
|   while p^<>#0 do begin
 | ||
|     if ord(p^)<%10000000 then begin
 | ||
|       // regular single byte character
 | ||
|       inc(p);
 | ||
|     end
 | ||
|     else if ord(p^)<%11000000 then begin
 | ||
|       // invalid
 | ||
|       p^:=' ';
 | ||
|       inc(p);
 | ||
|     end
 | ||
|     else if ((ord(p^) and %11100000) = %11000000) then begin
 | ||
|       // starts with %110 => should be 2 byte character
 | ||
|       if ((ord(p[1]) and %11000000) = %10000000) then begin
 | ||
|         c:=((ord(p^) and %00011111) shl 6);
 | ||
|            //or (ord(p[1]) and %00111111);
 | ||
|         if c<(1 shl 7) then
 | ||
|           p^:=' '  // fix XSS attack
 | ||
|         else
 | ||
|           inc(p,2)
 | ||
|       end
 | ||
|       else if p[1]<>#0 then
 | ||
|         p^:=' ';
 | ||
|     end
 | ||
|     else if ((ord(p^) and %11110000) = %11100000) then begin
 | ||
|       // starts with %1110 => should be 3 byte character
 | ||
|       if ((ord(p[1]) and %11000000) = %10000000)
 | ||
|       and ((ord(p[2]) and %11000000) = %10000000) then begin
 | ||
|         c:=((ord(p^) and %00011111) shl 12)
 | ||
|            or ((ord(p[1]) and %00111111) shl 6);
 | ||
|            //or (ord(p[2]) and %00111111);
 | ||
|         if c<(1 shl 11) then
 | ||
|           p^:=' '  // fix XSS attack
 | ||
|         else
 | ||
|           inc(p,3);
 | ||
|       end else
 | ||
|         p^:=' ';
 | ||
|     end
 | ||
|     else if ((ord(p^) and %11111000) = %11110000) then begin
 | ||
|       // starts with %11110 => should be 4 byte character
 | ||
|       if ((ord(p[1]) and %11000000) = %10000000)
 | ||
|       and ((ord(p[2]) and %11000000) = %10000000)
 | ||
|       and ((ord(p[3]) and %11000000) = %10000000) then begin
 | ||
|         c:=((ord(p^) and %00001111) shl 18)
 | ||
|            or ((ord(p[1]) and %00111111) shl 12)
 | ||
|            or ((ord(p[2]) and %00111111) shl 6);
 | ||
|            //or (ord(p[3]) and %00111111);
 | ||
|         if c<(1 shl 16) then
 | ||
|           p^:=' ' // fix XSS attack
 | ||
|         else
 | ||
|           inc(p,4)
 | ||
|       end else
 | ||
|         p^:=' ';
 | ||
|     end
 | ||
|     else begin
 | ||
|       p^:=' ';
 | ||
|       inc(p);
 | ||
|     end;
 | ||
|   end;
 | ||
| end;
 | ||
| 
 | ||
| procedure UTF8FixBroken(var S: string);
 | ||
| begin
 | ||
|   if S='' then exit;
 | ||
|   if FindInvalidUTF8Character(PChar(S),length(S),true)<0 then exit;
 | ||
|   UniqueString(S);
 | ||
|   UTF8FixBroken(PChar(S));
 | ||
| end;
 | ||
| 
 | ||
| function UTF8CharacterStrictLength(P: PChar): integer;
 | ||
| begin
 | ||
|   if p=nil then exit(0);
 | ||
|   if ord(p^)<%10000000 then begin
 | ||
|     // regular single byte character
 | ||
|     exit(1);
 | ||
|   end
 | ||
|   else if ord(p^)<%11000000 then begin
 | ||
|     // invalid single byte character
 | ||
|     exit(0);
 | ||
|   end
 | ||
|   else if ((ord(p^) and %11100000) = %11000000) then begin
 | ||
|     // should be 2 byte character
 | ||
|     if (ord(p[1]) and %11000000) = %10000000 then
 | ||
|       exit(2)
 | ||
|     else
 | ||
|       exit(0);
 | ||
|   end
 | ||
|   else if ((ord(p^) and %11110000) = %11100000) then begin
 | ||
|     // should be 3 byte character
 | ||
|     if ((ord(p[1]) and %11000000) = %10000000)
 | ||
|     and ((ord(p[2]) and %11000000) = %10000000) then
 | ||
|       exit(3)
 | ||
|     else
 | ||
|       exit(0);
 | ||
|   end
 | ||
|   else if ((ord(p^) and %11111000) = %11110000) then begin
 | ||
|     // should be 4 byte character
 | ||
|     if ((ord(p[1]) and %11000000) = %10000000)
 | ||
|     and ((ord(p[2]) and %11000000) = %10000000)
 | ||
|     and ((ord(p[3]) and %11000000) = %10000000) then
 | ||
|       exit(4)
 | ||
|     else
 | ||
|       exit(0);
 | ||
|   end else
 | ||
|     exit(0);
 | ||
| end;
 | ||
| 
 | ||
| function UTF8CStringToUTF8String(SourceStart: PChar; SourceLen: PtrInt) : string;
 | ||
| var
 | ||
|   Source: PChar;
 | ||
|   Dest: PChar;
 | ||
|   SourceEnd: PChar;
 | ||
|   SourceCopied: PChar;
 | ||
| 
 | ||
|   // Copies from SourceStart till Source to Dest and updates Dest
 | ||
|   procedure CopyPart; inline;
 | ||
|   var
 | ||
|     CopyLength: SizeInt;
 | ||
|   begin
 | ||
|     CopyLength := Source - SourceCopied;
 | ||
|     if CopyLength=0 then exit;
 | ||
|     System.move(SourceCopied^ , Dest^, CopyLength);
 | ||
|     SourceCopied:=Source;
 | ||
|     inc(Dest, CopyLength);
 | ||
|   end;
 | ||
| 
 | ||
| begin
 | ||
|   SetLength(Result, SourceLen);
 | ||
|   if SourceLen=0 then exit;
 | ||
|   SourceCopied:=SourceStart;
 | ||
|   Source:=SourceStart;
 | ||
|   Dest:=PChar(Result);
 | ||
|   SourceEnd := Source + SourceLen;
 | ||
|   while Source<SourceEnd do begin
 | ||
|     if (Source^='\') then begin
 | ||
|       CopyPart;
 | ||
|       inc(Source);
 | ||
|       if Source^ in ['t', 'n', '"', '\'] then begin
 | ||
|         case Source^ of
 | ||
|          't' : Dest^ := #9;
 | ||
|          '"' : Dest^ := '"';
 | ||
|          '\' : Dest^ := '\';
 | ||
|          'n' :
 | ||
|          // fpc 2.1.1 stores string constants as array of char so maybe this
 | ||
|          // will work for without ifdef (once available in 2.0.x too):
 | ||
|          // move(lineending, dest^, sizeof(LineEnding));
 | ||
| {$IFDEF WINDOWS}
 | ||
|                begin
 | ||
|                  move(lineending[1], dest^, length(LineEnding));
 | ||
|                  inc(dest, length(LineEnding)-1);
 | ||
|                end;
 | ||
| {$ELSE}
 | ||
|                Dest^ := LineEnding;
 | ||
| {$ENDIF}
 | ||
|         end;
 | ||
|         inc(Source);
 | ||
|         inc(Dest);
 | ||
|       end;
 | ||
|       SourceCopied := Source;
 | ||
|     end
 | ||
|     else
 | ||
|       Inc(Source); // no need for checking for UTF8, the / is never part of an UTF8 multibyte codepoint
 | ||
|   end;
 | ||
|   CopyPart;
 | ||
|   SetLength(Result, Dest - PChar(Result));
 | ||
| end;
 | ||
| 
 | ||
| function UTF8Pos(const SearchForText, SearchInText: string;
 | ||
|   StartPos: SizeInt = 1): PtrInt;
 | ||
| // returns the character index, where the SearchForText starts in SearchInText
 | ||
| // an optional StartPos can be given (in UTF-8 codepoints, not in byte)
 | ||
| // returns 0 if not found
 | ||
| var
 | ||
|   i: SizeInt;
 | ||
|   p: PChar;
 | ||
|   StartPosP: PChar;
 | ||
| begin
 | ||
|   Result:=0;
 | ||
|   if StartPos=1 then
 | ||
|   begin
 | ||
|     i:=System.Pos(SearchForText,SearchInText);
 | ||
|     if i>0 then
 | ||
|       Result:=UTF8Length(PChar(SearchInText),i-1)+1;
 | ||
|   end
 | ||
|   else if StartPos>1 then
 | ||
|   begin
 | ||
|     // skip
 | ||
|     StartPosP:=UTF8CharStart(PChar(SearchInText),Length(SearchInText),StartPos-1);
 | ||
|     if StartPosP=nil then exit;
 | ||
|     // search
 | ||
|     p:=UTF8PosP(PChar(SearchForText),length(SearchForText),
 | ||
|                 StartPosP,length(SearchInText)+PChar(SearchInText)-StartPosP);
 | ||
|     // get UTF-8 position
 | ||
|     if p=nil then exit;
 | ||
|     Result:=StartPos+UTF8Length(StartPosP,p-StartPosP);
 | ||
|   end;
 | ||
| end;
 | ||
| 
 | ||
| function UTF8PosP(SearchForText: PChar; SearchForTextLen: SizeInt;
 | ||
|   SearchInText: PChar; SearchInTextLen: SizeInt): PChar;
 | ||
| // returns the position where SearchInText starts in SearchForText
 | ||
| // returns nil if not found
 | ||
| var
 | ||
|   p: SizeInt;
 | ||
| begin
 | ||
|   Result:=nil;
 | ||
|   if (SearchForText=nil) or (SearchForTextLen=0) or (SearchInText=nil) then
 | ||
|     exit;
 | ||
|   while SearchInTextLen>0 do begin
 | ||
|     p:=IndexByte(SearchInText^,SearchInTextLen,PByte(SearchForText)^);
 | ||
|     if p<0 then exit;
 | ||
|     inc(SearchInText,p);
 | ||
|     dec(SearchInTextLen,p);
 | ||
|     if SearchInTextLen<SearchForTextLen then exit;
 | ||
|     if CompareMem(SearchInText,SearchForText,SearchForTextLen) then
 | ||
|       exit(SearchInText);
 | ||
|     inc(SearchInText);
 | ||
|     dec(SearchInTextLen);
 | ||
|   end;
 | ||
| end;
 | ||
| 
 | ||
| function UTF8Copy(const s: string; StartCharIndex, CharCount: PtrInt): string;
 | ||
| // returns substring
 | ||
| var
 | ||
|   StartBytePos: PChar;
 | ||
|   EndBytePos: PChar;
 | ||
|   MaxBytes: PtrInt;
 | ||
| begin
 | ||
|   StartBytePos:=UTF8CharStart(PChar(s),length(s),StartCharIndex-1);
 | ||
|   if StartBytePos=nil then
 | ||
|     Result:=''
 | ||
|   else begin
 | ||
|     MaxBytes:=PtrInt(PChar(s)+length(s)-StartBytePos);
 | ||
|     EndBytePos:=UTF8CharStart(StartBytePos,MaxBytes,CharCount);
 | ||
|     if EndBytePos=nil then
 | ||
|       Result:=copy(s,StartBytePos-PChar(s)+1,MaxBytes)
 | ||
|     else
 | ||
|       Result:=copy(s,StartBytePos-PChar(s)+1,EndBytePos-StartBytePos);
 | ||
|   end;
 | ||
| end;
 | ||
| 
 | ||
| procedure UTF8Delete(var s: String; StartCharIndex, CharCount: PtrInt);
 | ||
| var
 | ||
|   StartBytePos: PChar;
 | ||
|   EndBytePos: PChar;
 | ||
|   MaxBytes: PtrInt;
 | ||
| begin
 | ||
|   StartBytePos:=UTF8CharStart(PChar(s),length(s),StartCharIndex-1);
 | ||
|   if StartBytePos <> nil then
 | ||
|   begin
 | ||
|     MaxBytes:=PtrInt(PChar(s)+length(s)-StartBytePos);
 | ||
|     EndBytePos:=UTF8CharStart(StartBytePos,MaxBytes,CharCount);
 | ||
|     if EndBytePos=nil then
 | ||
|       Delete(s,StartBytePos-PChar(s)+1,MaxBytes)
 | ||
|     else
 | ||
|       Delete(s,StartBytePos-PChar(s)+1,EndBytePos-StartBytePos);
 | ||
|   end;
 | ||
| end;
 | ||
| 
 | ||
| procedure UTF8Insert(const source: String; var s: string; StartCharIndex: PtrInt);
 | ||
| var
 | ||
|   StartBytePos: PChar;
 | ||
| begin
 | ||
|   StartBytePos:=UTF8CharStart(PChar(s),length(s),StartCharIndex-1);
 | ||
|   if StartBytePos <> nil then
 | ||
|     Insert(source, s, StartBytePos-PChar(s)+1);
 | ||
| end;
 | ||
| 
 | ||
| {
 | ||
|   AInStr - The input string
 | ||
|   ALanguage - The language. Use '' for maximum speed if one desires to ignore the language
 | ||
|               The language should be specified in the format from ISO 639-1,
 | ||
|               which uses 2 characters to represent each language.
 | ||
|               If the language has no code in ISO 639-1, then the 3-chars code
 | ||
|               from ISO 639-2 should be used.
 | ||
|               Example: "tr" - Turkish language locale
 | ||
| 
 | ||
|   Data from here: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
 | ||
| 
 | ||
|   The columns in the file UnicodeData.txt are explained here:
 | ||
|   http://www.ksu.ru/eng/departments/ktk/test/perl/lib/unicode/UCDFF301.html#Case Mappings
 | ||
| }
 | ||
| function UTF8LowerCase(const AInStr: string; ALanguage: string=''): string;
 | ||
| var
 | ||
|   CounterDiff: PtrInt;
 | ||
|   InStr, InStrEnd, OutStr: PChar;
 | ||
|   // Language identification
 | ||
|   IsTurkish: Boolean;
 | ||
|   c1, c2, c3, new_c1, new_c2, new_c3: Char;
 | ||
|   p: SizeInt;
 | ||
| begin
 | ||
|   Result:=AInStr;
 | ||
|   InStr := PChar(AInStr);
 | ||
|   InStrEnd := InStr + length(AInStr); // points behind last char
 | ||
| 
 | ||
|   // Does a fast initial parsing of the string to maybe avoid doing
 | ||
|   // UniqueString if the resulting string will be identical
 | ||
|   while (InStr < InStrEnd) do
 | ||
|   begin
 | ||
|     c1 := InStr^;
 | ||
|     case c1 of
 | ||
|     'A'..'Z': Break;
 | ||
|     #$C3..#$FF:
 | ||
|       case c1 of
 | ||
|       #$C3..#$C9, #$CE, #$CF, #$D0..#$D5, #$E1..#$E2,#$E5:
 | ||
|         begin
 | ||
|           c2 := InStr[1];
 | ||
|           case c1 of
 | ||
|           #$C3: if c2 in [#$80..#$9E] then Break;
 | ||
|           #$C4:
 | ||
|           begin
 | ||
|             case c2 of
 | ||
|             #$80..#$AF, #$B2..#$B6: if ord(c2) mod 2 = 0 then Break;
 | ||
|             #$B8..#$FF: if ord(c2) mod 2 = 1 then Break;
 | ||
|             #$B0: Break;
 | ||
|             end;
 | ||
|           end;
 | ||
|           #$C5:
 | ||
|           begin
 | ||
|             case c2 of
 | ||
|               #$8A..#$B7: if ord(c2) mod 2 = 0 then Break;
 | ||
|               #$00..#$88, #$B9..#$FF: if ord(c2) mod 2 = 1 then Break;
 | ||
|               #$B8: Break;
 | ||
|             end;
 | ||
|           end;
 | ||
|           // Process E5 to avoid stopping on chinese chars
 | ||
|           #$E5: if (c2 = #$BC) and (InStr[2] in [#$A1..#$BA]) then Break;
 | ||
|           // Others are too complex, better not to pre-inspect them
 | ||
|           else
 | ||
|             Break;
 | ||
|           end;
 | ||
|           // already lower, or otherwhise not affected
 | ||
|         end;
 | ||
|       end;
 | ||
|     end;
 | ||
|     inc(InStr);
 | ||
|   end;
 | ||
| 
 | ||
|   if InStr >= InStrEnd then Exit;
 | ||
| 
 | ||
|   // Language identification
 | ||
|   IsTurkish := (ALanguage = 'tr') or (ALanguage = 'az'); // Turkish and Azeri have a special handling
 | ||
| 
 | ||
|   UniqueString(Result);
 | ||
|   OutStr := PChar(Result) + (InStr - PChar(AInStr));
 | ||
|   CounterDiff := 0;
 | ||
| 
 | ||
|   while InStr < InStrEnd do
 | ||
|   begin
 | ||
|     c1 := InStr^;
 | ||
|     case c1 of
 | ||
|       // codepoints      UTF-8 range           Description                Case change
 | ||
|       // $0041..$005A    $41..$5A              Capital ASCII              X+$20
 | ||
|       'A'..'Z':
 | ||
|       begin
 | ||
|         { First ASCII chars }
 | ||
|         // Special turkish handling
 | ||
|         // capital undotted I to small undotted i
 | ||
|         if IsTurkish and (c1 = 'I') then
 | ||
|         begin
 | ||
|           p:=OutStr - PChar(Result);
 | ||
|           SetLength(Result,Length(Result)+1);// Increase the buffer
 | ||
|           OutStr := PChar(Result)+p;
 | ||
|           OutStr^ := #$C4;
 | ||
|           inc(OutStr);
 | ||
|           OutStr^ := #$B1;
 | ||
|           dec(CounterDiff);
 | ||
|         end
 | ||
|         else
 | ||
|         begin
 | ||
|           OutStr^ := chr(ord(c1)+32);
 | ||
|         end;
 | ||
|         inc(InStr);
 | ||
|         inc(OutStr);
 | ||
|       end;
 | ||
| 
 | ||
|       // Chars with 2-bytes which might be modified
 | ||
|       #$C3..#$D5:
 | ||
|       begin
 | ||
|         c2 := InStr[1];
 | ||
|         new_c1 := c1;
 | ||
|         new_c2 := c2;
 | ||
|         case c1 of
 | ||
|         // Latin Characters 0000–0FFF http://en.wikibooks.org/wiki/Unicode/Character_reference/0000-0FFF
 | ||
|         // codepoints      UTF-8 range           Description                Case change
 | ||
|         // $00C0..$00D6    C3 80..C3 96          Capital Latin with accents X+$20
 | ||
|         // $D7             C3 97                 Multiplication Sign        N/A
 | ||
|         // $00D8..$00DE    C3 98..C3 9E          Capital Latin with accents X+$20
 | ||
|         // $DF             C3 9F                 German beta ß              already lowercase
 | ||
|         #$C3:
 | ||
|         begin
 | ||
|           case c2 of
 | ||
|           #$80..#$96, #$98..#$9E: new_c2 := chr(ord(c2) + $20)
 | ||
|           end;
 | ||
|         end;
 | ||
|         // $0100..$012F    C4 80..C4 AF        Capital/Small Latin accents  if mod 2 = 0 then X+1
 | ||
|         // $0130..$0131    C4 B0..C4 B1        Turkish
 | ||
|         //  C4 B0 turkish uppercase dotted i -> 'i'
 | ||
|         //  C4 B1 turkish lowercase undotted ı
 | ||
|         // $0132..$0137    C4 B2..C4 B7        Capital/Small Latin accents  if mod 2 = 0 then X+1
 | ||
|         // $0138           C4 B8               ĸ                            N/A
 | ||
|         // $0139..$024F    C4 B9..C5 88        Capital/Small Latin accents  if mod 2 = 1 then X+1
 | ||
|         #$C4:
 | ||
|         begin
 | ||
|           case c2 of
 | ||
|             #$80..#$AF, #$B2..#$B7: if ord(c2) mod 2 = 0 then new_c2 := chr(ord(c2) + 1);
 | ||
|             #$B0: // Turkish
 | ||
|             begin
 | ||
|               OutStr^ := 'i';
 | ||
|               inc(InStr, 2);
 | ||
|               inc(OutStr);
 | ||
|               inc(CounterDiff, 1);
 | ||
|               Continue;
 | ||
|             end;
 | ||
|             #$B9..#$BE: if ord(c2) mod 2 = 1 then new_c2 := chr(ord(c2) + 1);
 | ||
|             #$BF: // This crosses the borders between the first byte of the UTF-8 char
 | ||
|             begin
 | ||
|               new_c1 := #$C5;
 | ||
|               new_c2 := #$80;
 | ||
|             end;
 | ||
|           end;
 | ||
|         end;
 | ||
|         // $C589 ʼn
 | ||
|         // $C58A..$C5B7: if OldChar mod 2 = 0 then NewChar := OldChar + 1;
 | ||
|         // $C5B8:        NewChar := $C3BF; // Ÿ
 | ||
|         // $C5B9..$C8B3: if OldChar mod 2 = 1 then NewChar := OldChar + 1;
 | ||
|         #$C5:
 | ||
|         begin
 | ||
|           case c2 of
 | ||
|             #$8A..#$B7: //0
 | ||
|             begin
 | ||
|               if ord(c2) mod 2 = 0 then
 | ||
|                 new_c2 := chr(ord(c2) + 1);
 | ||
|             end;
 | ||
|             #$00..#$88, #$B9..#$BE: //1
 | ||
|             begin
 | ||
|               if ord(c2) mod 2 = 1 then
 | ||
|                 new_c2 := chr(ord(c2) + 1);
 | ||
|             end;
 | ||
|             #$B8:  // Ÿ
 | ||
|             begin
 | ||
|               new_c1 := #$C3;
 | ||
|               new_c2 := #$BF;
 | ||
|             end;
 | ||
|           end;
 | ||
|         end;
 | ||
|         {A convoluted part: C6 80..C6 8F
 | ||
| 
 | ||
|         0180;LATIN SMALL LETTER B WITH STROKE;Ll;0;L;;;;;N;LATIN SMALL LETTER B BAR;;0243;;0243
 | ||
|         0181;LATIN CAPITAL LETTER B WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER B HOOK;;;0253; => C6 81=>C9 93
 | ||
|         0182;LATIN CAPITAL LETTER B WITH TOPBAR;Lu;0;L;;;;;N;LATIN CAPITAL LETTER B TOPBAR;;;0183;
 | ||
|         0183;LATIN SMALL LETTER B WITH TOPBAR;Ll;0;L;;;;;N;LATIN SMALL LETTER B TOPBAR;;0182;;0182
 | ||
|         0184;LATIN CAPITAL LETTER TONE SIX;Lu;0;L;;;;;N;;;;0185;
 | ||
|         0185;LATIN SMALL LETTER TONE SIX;Ll;0;L;;;;;N;;;0184;;0184
 | ||
|         0186;LATIN CAPITAL LETTER OPEN O;Lu;0;L;;;;;N;;;;0254; ==> C9 94
 | ||
|         0187;LATIN CAPITAL LETTER C WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER C HOOK;;;0188;
 | ||
|         0188;LATIN SMALL LETTER C WITH HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER C HOOK;;0187;;0187
 | ||
|         0189;LATIN CAPITAL LETTER AFRICAN D;Lu;0;L;;;;;N;;;;0256; => C9 96
 | ||
|         018A;LATIN CAPITAL LETTER D WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER D HOOK;;;0257; => C9 97
 | ||
|         018B;LATIN CAPITAL LETTER D WITH TOPBAR;Lu;0;L;;;;;N;LATIN CAPITAL LETTER D TOPBAR;;;018C;
 | ||
|         018C;LATIN SMALL LETTER D WITH TOPBAR;Ll;0;L;;;;;N;LATIN SMALL LETTER D TOPBAR;;018B;;018B
 | ||
|         018D;LATIN SMALL LETTER TURNED DELTA;Ll;0;L;;;;;N;;;;;
 | ||
|         018E;LATIN CAPITAL LETTER REVERSED E;Lu;0;L;;;;;N;LATIN CAPITAL LETTER TURNED E;;;01DD; => C7 9D
 | ||
|         018F;LATIN CAPITAL LETTER SCHWA;Lu;0;L;;;;;N;;;;0259; => C9 99
 | ||
|         }
 | ||
|         #$C6:
 | ||
|         begin
 | ||
|           case c2 of
 | ||
|             #$81:
 | ||
|             begin
 | ||
|               new_c1 := #$C9;
 | ||
|               new_c2 := #$93;
 | ||
|             end;
 | ||
|             #$82..#$85:
 | ||
|             begin
 | ||
|               if ord(c2) mod 2 = 0 then
 | ||
|                 new_c2 := chr(ord(c2) + 1);
 | ||
|             end;
 | ||
|             #$87..#$88,#$8B..#$8C:
 | ||
|             begin
 | ||
|               if ord(c2) mod 2 = 1 then
 | ||
|                 new_c2 := chr(ord(c2) + 1);
 | ||
|             end;
 | ||
|             #$86:
 | ||
|             begin
 | ||
|               new_c1 := #$C9;
 | ||
|               new_c2 := #$94;
 | ||
|             end;
 | ||
|             #$89:
 | ||
|             begin
 | ||
|               new_c1 := #$C9;
 | ||
|               new_c2 := #$96;
 | ||
|             end;
 | ||
|             #$8A:
 | ||
|             begin
 | ||
|               new_c1 := #$C9;
 | ||
|               new_c2 := #$97;
 | ||
|             end;
 | ||
|             #$8E:
 | ||
|             begin
 | ||
|               new_c1 := #$C7;
 | ||
|               new_c2 := #$9D;
 | ||
|             end;
 | ||
|             #$8F:
 | ||
|             begin
 | ||
|               new_c1 := #$C9;
 | ||
|               new_c2 := #$99;
 | ||
|             end;
 | ||
|           {
 | ||
|           And also C6 90..C6 9F
 | ||
| 
 | ||
|           0190;LATIN CAPITAL LETTER OPEN E;Lu;0;L;;;;;N;LATIN CAPITAL LETTER EPSILON;;;025B; => C9 9B
 | ||
|           0191;LATIN CAPITAL LETTER F WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER F HOOK;;;0192; => +1
 | ||
|           0192;LATIN SMALL LETTER F WITH HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER SCRIPT F;;0191;;0191 <=
 | ||
|           0193;LATIN CAPITAL LETTER G WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER G HOOK;;;0260; => C9 A0
 | ||
|           0194;LATIN CAPITAL LETTER GAMMA;Lu;0;L;;;;;N;;;;0263; => C9 A3
 | ||
|           0195;LATIN SMALL LETTER HV;Ll;0;L;;;;;N;LATIN SMALL LETTER H V;;01F6;;01F6 <=
 | ||
|           0196;LATIN CAPITAL LETTER IOTA;Lu;0;L;;;;;N;;;;0269; => C9 A9
 | ||
|           0197;LATIN CAPITAL LETTER I WITH STROKE;Lu;0;L;;;;;N;LATIN CAPITAL LETTER BARRED I;;;0268; => C9 A8
 | ||
|           0198;LATIN CAPITAL LETTER K WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER K HOOK;;;0199; => +1
 | ||
|           0199;LATIN SMALL LETTER K WITH HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER K HOOK;;0198;;0198 <=
 | ||
|           019A;LATIN SMALL LETTER L WITH BAR;Ll;0;L;;;;;N;LATIN SMALL LETTER BARRED L;;023D;;023D <=
 | ||
|           019B;LATIN SMALL LETTER LAMBDA WITH STROKE;Ll;0;L;;;;;N;LATIN SMALL LETTER BARRED LAMBDA;;;; <=
 | ||
|           019C;LATIN CAPITAL LETTER TURNED M;Lu;0;L;;;;;N;;;;026F; => C9 AF
 | ||
|           019D;LATIN CAPITAL LETTER N WITH LEFT HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER N HOOK;;;0272; => C9 B2
 | ||
|           019E;LATIN SMALL LETTER N WITH LONG RIGHT LEG;Ll;0;L;;;;;N;;;0220;;0220 <=
 | ||
|           019F;LATIN CAPITAL LETTER O WITH MIDDLE TILDE;Lu;0;L;;;;;N;LATIN CAPITAL LETTER BARRED O;;;0275; => C9 B5
 | ||
|           }
 | ||
|           #$90:
 | ||
|           begin
 | ||
|             new_c1 := #$C9;
 | ||
|             new_c2 := #$9B;
 | ||
|           end;
 | ||
|           #$91, #$98: new_c2 := chr(ord(c2)+1);
 | ||
|           #$93:
 | ||
|           begin
 | ||
|             new_c1 := #$C9;
 | ||
|             new_c2 := #$A0;
 | ||
|           end;
 | ||
|           #$94:
 | ||
|           begin
 | ||
|             new_c1 := #$C9;
 | ||
|             new_c2 := #$A3;
 | ||
|           end;
 | ||
|           #$96:
 | ||
|           begin
 | ||
|             new_c1 := #$C9;
 | ||
|             new_c2 := #$A9;
 | ||
|           end;
 | ||
|           #$97:
 | ||
|           begin
 | ||
|             new_c1 := #$C9;
 | ||
|             new_c2 := #$A8;
 | ||
|           end;
 | ||
|           #$9C:
 | ||
|           begin
 | ||
|             new_c1 := #$C9;
 | ||
|             new_c2 := #$AF;
 | ||
|           end;
 | ||
|           #$9D:
 | ||
|           begin
 | ||
|             new_c1 := #$C9;
 | ||
|             new_c2 := #$B2;
 | ||
|           end;
 | ||
|           #$9F:
 | ||
|           begin
 | ||
|             new_c1 := #$C9;
 | ||
|             new_c2 := #$B5;
 | ||
|           end;
 | ||
|           {
 | ||
|           And also C6 A0..C6 AF
 | ||
| 
 | ||
|           01A0;LATIN CAPITAL LETTER O WITH HORN;Lu;0;L;004F 031B;;;;N;LATIN CAPITAL LETTER O HORN;;;01A1; => +1
 | ||
|           01A1;LATIN SMALL LETTER O WITH HORN;Ll;0;L;006F 031B;;;;N;LATIN SMALL LETTER O HORN;;01A0;;01A0 <=
 | ||
|           01A2;LATIN CAPITAL LETTER OI;Lu;0;L;;;;;N;LATIN CAPITAL LETTER O I;;;01A3; => +1
 | ||
|           01A3;LATIN SMALL LETTER OI;Ll;0;L;;;;;N;LATIN SMALL LETTER O I;;01A2;;01A2 <=
 | ||
|           01A4;LATIN CAPITAL LETTER P WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER P HOOK;;;01A5; => +1
 | ||
|           01A5;LATIN SMALL LETTER P WITH HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER P HOOK;;01A4;;01A4 <=
 | ||
|           01A6;LATIN LETTER YR;Lu;0;L;;;;;N;LATIN LETTER Y R;;;0280; => CA 80
 | ||
|           01A7;LATIN CAPITAL LETTER TONE TWO;Lu;0;L;;;;;N;;;;01A8; => +1
 | ||
|           01A8;LATIN SMALL LETTER TONE TWO;Ll;0;L;;;;;N;;;01A7;;01A7 <=
 | ||
|           01A9;LATIN CAPITAL LETTER ESH;Lu;0;L;;;;;N;;;;0283; => CA 83
 | ||
|           01AA;LATIN LETTER REVERSED ESH LOOP;Ll;0;L;;;;;N;;;;;
 | ||
|           01AB;LATIN SMALL LETTER T WITH PALATAL HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER T PALATAL HOOK;;;; <=
 | ||
|           01AC;LATIN CAPITAL LETTER T WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER T HOOK;;;01AD; => +1
 | ||
|           01AD;LATIN SMALL LETTER T WITH HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER T HOOK;;01AC;;01AC <=
 | ||
|           01AE;LATIN CAPITAL LETTER T WITH RETROFLEX HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER T RETROFLEX HOOK;;;0288; => CA 88
 | ||
|           01AF;LATIN CAPITAL LETTER U WITH HORN;Lu;0;L;0055 031B;;;;N;LATIN CAPITAL LETTER U HORN;;;01B0; => +1
 | ||
|           }
 | ||
|           #$A0..#$A5,#$AC:
 | ||
|           begin
 | ||
|             if ord(c2) mod 2 = 0 then
 | ||
|               new_c2 := chr(ord(c2) + 1);
 | ||
|           end;
 | ||
|           #$A7,#$AF:
 | ||
|           begin
 | ||
|             if ord(c2) mod 2 = 1 then
 | ||
|               new_c2 := chr(ord(c2) + 1);
 | ||
|           end;
 | ||
|           #$A6:
 | ||
|           begin
 | ||
|             new_c1 := #$CA;
 | ||
|             new_c2 := #$80;
 | ||
|           end;
 | ||
|           #$A9:
 | ||
|           begin
 | ||
|             new_c1 := #$CA;
 | ||
|             new_c2 := #$83;
 | ||
|           end;
 | ||
|           #$AE:
 | ||
|           begin
 | ||
|             new_c1 := #$CA;
 | ||
|             new_c2 := #$88;
 | ||
|           end;
 | ||
|           {
 | ||
|           And also C6 B0..C6 BF
 | ||
| 
 | ||
|           01B0;LATIN SMALL LETTER U WITH HORN;Ll;0;L;0075 031B;;;;N;LATIN SMALL LETTER U HORN;;01AF;;01AF <= -1
 | ||
|           01B1;LATIN CAPITAL LETTER UPSILON;Lu;0;L;;;;;N;;;;028A; => CA 8A
 | ||
|           01B2;LATIN CAPITAL LETTER V WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER SCRIPT V;;;028B; => CA 8B
 | ||
|           01B3;LATIN CAPITAL LETTER Y WITH HOOK;Lu;0;L;;;;;N;LATIN CAPITAL LETTER Y HOOK;;;01B4; => +1
 | ||
|           01B4;LATIN SMALL LETTER Y WITH HOOK;Ll;0;L;;;;;N;LATIN SMALL LETTER Y HOOK;;01B3;;01B3 <=
 | ||
|           01B5;LATIN CAPITAL LETTER Z WITH STROKE;Lu;0;L;;;;;N;LATIN CAPITAL LETTER Z BAR;;;01B6; => +1
 | ||
|           01B6;LATIN SMALL LETTER Z WITH STROKE;Ll;0;L;;;;;N;LATIN SMALL LETTER Z BAR;;01B5;;01B5 <=
 | ||
|           01B7;LATIN CAPITAL LETTER EZH;Lu;0;L;;;;;N;LATIN CAPITAL LETTER YOGH;;;0292; => CA 92
 | ||
|           01B8;LATIN CAPITAL LETTER EZH REVERSED;Lu;0;L;;;;;N;LATIN CAPITAL LETTER REVERSED YOGH;;;01B9; => +1
 | ||
|           01B9;LATIN SMALL LETTER EZH REVERSED;Ll;0;L;;;;;N;LATIN SMALL LETTER REVERSED YOGH;;01B8;;01B8 <=
 | ||
|           01BA;LATIN SMALL LETTER EZH WITH TAIL;Ll;0;L;;;;;N;LATIN SMALL LETTER YOGH WITH TAIL;;;; <=
 | ||
|           01BB;LATIN LETTER TWO WITH STROKE;Lo;0;L;;;;;N;LATIN LETTER TWO BAR;;;; X
 | ||
|           01BC;LATIN CAPITAL LETTER TONE FIVE;Lu;0;L;;;;;N;;;;01BD; => +1
 | ||
|           01BD;LATIN SMALL LETTER TONE FIVE;Ll;0;L;;;;;N;;;01BC;;01BC <=
 | ||
|           01BE;LATIN LETTER INVERTED GLOTTAL STOP WITH STROKE;Ll;0;L;;;;;N;LATIN LETTER INVERTED GLOTTAL STOP BAR;;;; X
 | ||
|           01BF;LATIN LETTER WYNN;Ll;0;L;;;;;N;;;01F7;;01F7  <=
 | ||
|           }
 | ||
|           #$B8,#$BC:
 | ||
|           begin
 | ||
|             if ord(c2) mod 2 = 0 then
 | ||
|               new_c2 := chr(ord(c2) + 1);
 | ||
|           end;
 | ||
|           #$B3..#$B6:
 | ||
|           begin
 | ||
|             if ord(c2) mod 2 = 1 then
 | ||
|               new_c2 := chr(ord(c2) + 1);
 | ||
|           end;
 | ||
|           #$B1:
 | ||
|           begin
 | ||
|             new_c1 := #$CA;
 | ||
|             new_c2 := #$8A;
 | ||
|           end;
 | ||
|           #$B2:
 | ||
|           begin
 | ||
|             new_c1 := #$CA;
 | ||
|             new_c2 := #$8B;
 | ||
|           end;
 | ||
|           #$B7:
 | ||
|           begin
 | ||
|             new_c1 := #$CA;
 | ||
|             new_c2 := #$92;
 | ||
|           end;
 | ||
|           end;
 | ||
|         end;
 | ||
|         #$C7:
 | ||
|         begin
 | ||
|           case c2 of
 | ||
|           #$84..#$8C,#$B1..#$B3:
 | ||
|           begin
 | ||
|             if (ord(c2) and $F) mod 3 = 1 then new_c2 := chr(ord(c2) + 2)
 | ||
|             else if (ord(c2) and $F) mod 3 = 2 then new_c2 := chr(ord(c2) + 1);
 | ||
|           end;
 | ||
|           #$8D..#$9C:
 | ||
|           begin
 | ||
|             if ord(c2) mod 2 = 1 then
 | ||
|               new_c2 := chr(ord(c2) + 1);
 | ||
|           end;
 | ||
|           #$9E..#$AF,#$B4..#$B5,#$B8..#$BF:
 | ||
|           begin
 | ||
|             if ord(c2) mod 2 = 0 then
 | ||
|               new_c2 := chr(ord(c2) + 1);
 | ||
|           end;
 | ||
|           {
 | ||
|           01F6;LATIN CAPITAL LETTER HWAIR;Lu;0;L;;;;;N;;;;0195;
 | ||
|           01F7;LATIN CAPITAL LETTER WYNN;Lu;0;L;;;;;N;;;;01BF;
 | ||
|           }
 | ||
|           #$B6:
 | ||
|           begin
 | ||
|             new_c1 := #$C6;
 | ||
|             new_c2 := #$95;
 | ||
|           end;
 | ||
|           #$B7:
 | ||
|           begin
 | ||
|             new_c1 := #$C6;
 | ||
|             new_c2 := #$BF;
 | ||
|           end;
 | ||
|           end;
 | ||
|         end;
 | ||
|         {
 | ||
|         Codepoints 0200 to 023F
 | ||
|         }
 | ||
|         #$C8:
 | ||
|         begin
 | ||
|           // For this one we can simply start with a default and override for some specifics
 | ||
|           if (c2 in [#$80..#$B3]) and (ord(c2) mod 2 = 0) then new_c2 := chr(ord(c2) + 1);
 | ||
| 
 | ||
|           case c2 of
 | ||
|           #$A0:
 | ||
|           begin
 | ||
|             new_c1 := #$C6;
 | ||
|             new_c2 := #$9E;
 | ||
|           end;
 | ||
|           #$A1: new_c2 := c2;
 | ||
|           {
 | ||
|           023A;LATIN CAPITAL LETTER A WITH STROKE;Lu;0;L;;;;;N;;;;2C65; => E2 B1 A5
 | ||
|           023B;LATIN CAPITAL LETTER C WITH STROKE;Lu;0;L;;;;;N;;;;023C; => +1
 | ||
|           023C;LATIN SMALL LETTER C WITH STROKE;Ll;0;L;;;;;N;;;023B;;023B <=
 | ||
|           023D;LATIN CAPITAL LETTER L WITH BAR;Lu;0;L;;;;;N;;;;019A; => C6 9A
 | ||
|           023E;LATIN CAPITAL LETTER T WITH DIAGONAL STROKE;Lu;0;L;;;;;N;;;;2C66; => E2 B1 A6
 | ||
|           023F;LATIN SMALL LETTER S WITH SWASH TAIL;Ll;0;L;;;;;N;;;2C7E;;2C7E <=
 | ||
|           0240;LATIN SMALL LETTER Z WITH SWASH TAIL;Ll;0;L;;;;;N;;;2C7F;;2C7F <=
 | ||
|           }
 | ||
|           #$BA,#$BE:
 | ||
|           begin
 | ||
|             p:= OutStr - PChar(Result);
 | ||
|             SetLength(Result,Length(Result)+1);// Increase the buffer
 | ||
|             OutStr := PChar(Result)+p;
 | ||
|             OutStr^ := #$E2;
 | ||
|             inc(OutStr);
 | ||
|             OutStr^ := #$B1;
 | ||
|             inc(OutStr);
 | ||
|             if c2 = #$BA then OutStr^ := #$A5
 | ||
|             else OutStr^ := #$A6;
 | ||
|             dec(CounterDiff);
 | ||
|             inc(OutStr);
 | ||
|             inc(InStr, 2);
 | ||
|             Continue;
 | ||
|           end;
 | ||
|           #$BD:
 | ||
|           begin
 | ||
|             new_c1 := #$C6;
 | ||
|             new_c2 := #$9A;
 | ||
|           end;
 | ||
|           #$BB: new_c2 := chr(ord(c2) + 1);
 | ||
|           end;
 | ||
|         end;
 | ||
|         {
 | ||
|         Codepoints 0240 to 027F
 | ||
| 
 | ||
|         Here only 0240..024F needs lowercase
 | ||
|         }
 | ||
|         #$C9:
 | ||
|         begin
 | ||
|           case c2 of
 | ||
|           #$81..#$82:
 | ||
|           begin
 | ||
|             if ord(c2) mod 2 = 1 then
 | ||
|               new_c2 := chr(ord(c2) + 1);
 | ||
|           end;
 | ||
|           #$86..#$8F:
 | ||
|           begin
 | ||
|             if ord(c2) mod 2 = 0 then
 | ||
|               new_c2 := chr(ord(c2) + 1);
 | ||
|           end;
 | ||
|           #$83:
 | ||
|           begin
 | ||
|             new_c1 := #$C6;
 | ||
|             new_c2 := #$80;
 | ||
|           end;
 | ||
|           #$84:
 | ||
|           begin
 | ||
|             new_c1 := #$CA;
 | ||
|             new_c2 := #$89;
 | ||
|           end;
 | ||
|           #$85:
 | ||
|           begin
 | ||
|             new_c1 := #$CA;
 | ||
|             new_c2 := #$8C;
 | ||
|           end;
 | ||
|           end;
 | ||
|         end;
 | ||
|         // $CE91..$CE9F: NewChar := OldChar + $20; // Greek Characters
 | ||
|         // $CEA0..$CEA9: NewChar := OldChar + $E0; // Greek Characters
 | ||
|         #$CE:
 | ||
|         begin
 | ||
|           case c2 of
 | ||
|             // 0380 = CE 80
 | ||
|             #$86: new_c2 := #$AC;
 | ||
|             #$88: new_c2 := #$AD;
 | ||
|             #$89: new_c2 := #$AE;
 | ||
|             #$8A: new_c2 := #$AF;
 | ||
|             #$8C: new_c1 := #$CF; // By coincidence new_c2 remains the same
 | ||
|             #$8E:
 | ||
|             begin
 | ||
|               new_c1 := #$CF;
 | ||
|               new_c2 := #$8D;
 | ||
|             end;
 | ||
|             #$8F:
 | ||
|             begin
 | ||
|               new_c1 := #$CF;
 | ||
|               new_c2 := #$8E;
 | ||
|             end;
 | ||
|             // 0390 = CE 90
 | ||
|             #$91..#$9F:
 | ||
|             begin
 | ||
|               new_c2 := chr(ord(c2) + $20);
 | ||
|             end;
 | ||
|             // 03A0 = CE A0
 | ||
|             #$A0..#$AB:
 | ||
|             begin
 | ||
|               new_c1 := #$CF;
 | ||
|               new_c2 := chr(ord(c2) - $20);
 | ||
|             end;
 | ||
|           end;
 | ||
|         end;
 | ||
|         // 03C0 = CF 80
 | ||
|         // 03D0 = CF 90
 | ||
|         // 03E0 = CF A0
 | ||
|         // 03F0 = CF B0
 | ||
|         #$CF:
 | ||
|         begin
 | ||
|           case c2 of
 | ||
|             // 03CF;GREEK CAPITAL KAI SYMBOL;Lu;0;L;;;;;N;;;;03D7; CF 8F => CF 97
 | ||
|             #$8F: new_c2 := #$97;
 | ||
|             // 03D8;GREEK LETTER ARCHAIC KOPPA;Lu;0;L;;;;;N;;;;03D9;
 | ||
|             #$98: new_c2 := #$99;
 | ||
|             // 03DA;GREEK LETTER STIGMA;Lu;0;L;;;;;N;GREEK CAPITAL LETTER STIGMA;;;03DB;
 | ||
|             #$9A: new_c2 := #$9B;
 | ||
|             // 03DC;GREEK LETTER DIGAMMA;Lu;0;L;;;;;N;GREEK CAPITAL LETTER DIGAMMA;;;03DD;
 | ||
|             #$9C: new_c2 := #$9D;
 | ||
|             // 03DE;GREEK LETTER KOPPA;Lu;0;L;;;;;N;GREEK CAPITAL LETTER KOPPA;;;03DF;
 | ||
|             #$9E: new_c2 := #$9F;
 | ||
|             {
 | ||
|             03E0;GREEK LETTER SAMPI;Lu;0;L;;;;;N;GREEK CAPITAL LETTER SAMPI;;;03E1;
 | ||
|             03E1;GREEK SMALL LETTER SAMPI;Ll;0;L;;;;;N;;;03E0;;03E0
 | ||
|             03E2;COPTIC CAPITAL LETTER SHEI;Lu;0;L;;;;;N;GREEK CAPITAL LETTER SHEI;;;03E3;
 | ||
|             03E3;COPTIC SMALL LETTER SHEI;Ll;0;L;;;;;N;GREEK SMALL LETTER SHEI;;03E2;;03E2
 | ||
|             ...
 | ||
|             03EE;COPTIC CAPITAL LETTER DEI;Lu;0;L;;;;;N;GREEK CAPITAL LETTER DEI;;;03EF;
 | ||
|             03EF;COPTIC SMALL LETTER DEI;Ll;0;L;;;;;N;GREEK SMALL LETTER DEI;;03EE;;03EE
 | ||
|             }
 | ||
|             #$A0..#$AF: if ord(c2) mod 2 = 0 then
 | ||
|                           new_c2 := chr(ord(c2) + 1);
 | ||
|             // 03F4;GREEK CAPITAL THETA SYMBOL;Lu;0;L;<compat> 0398;;;;N;;;;03B8;
 | ||
|             #$B4:
 | ||
|             begin
 | ||
|               new_c1 := #$CE;
 | ||
|               new_c2 := #$B8;
 | ||
|             end;
 | ||
|             // 03F7;GREEK CAPITAL LETTER SHO;Lu;0;L;;;;;N;;;;03F8;
 | ||
|             #$B7: new_c2 := #$B8;
 | ||
|             // 03F9;GREEK CAPITAL LUNATE SIGMA SYMBOL;Lu;0;L;<compat> 03A3;;;;N;;;;03F2;
 | ||
|             #$B9: new_c2 := #$B2;
 | ||
|             // 03FA;GREEK CAPITAL LETTER SAN;Lu;0;L;;;;;N;;;;03FB;
 | ||
|             #$BA: new_c2 := #$BB;
 | ||
|             // 03FD;GREEK CAPITAL REVERSED LUNATE SIGMA SYMBOL;Lu;0;L;;;;;N;;;;037B;
 | ||
|             #$BD:
 | ||
|             begin
 | ||
|               new_c1 := #$CD;
 | ||
|               new_c2 := #$BB;
 | ||
|             end;
 | ||
|             // 03FE;GREEK CAPITAL DOTTED LUNATE SIGMA SYMBOL;Lu;0;L;;;;;N;;;;037C;
 | ||
|             #$BE:
 | ||
|             begin
 | ||
|               new_c1 := #$CD;
 | ||
|               new_c2 := #$BC;
 | ||
|             end;
 | ||
|             // 03FF;GREEK CAPITAL REVERSED DOTTED LUNATE SIGMA SYMBOL;Lu;0;L;;;;;N;;;;037D;
 | ||
|             #$BF:
 | ||
|             begin
 | ||
|               new_c1 := #$CD;
 | ||
|               new_c2 := #$BD;
 | ||
|             end;
 | ||
|           end;
 | ||
|         end;
 | ||
|         // $D080..$D08F: NewChar := OldChar + $110; // Cyrillic alphabet
 | ||
|         // $D090..$D09F: NewChar := OldChar + $20; // Cyrillic alphabet
 | ||
|         // $D0A0..$D0AF: NewChar := OldChar + $E0; // Cyrillic alphabet
 | ||
|         #$D0:
 | ||
|         begin
 | ||
|           c2 := InStr[1];
 | ||
|           case c2 of
 | ||
|             #$80..#$8F:
 | ||
|             begin
 | ||
|               new_c1 := chr(ord(c1)+1);
 | ||
|               new_c2  := chr(ord(c2) + $10);
 | ||
|             end;
 | ||
|             #$90..#$9F:
 | ||
|             begin
 | ||
|               new_c2 := chr(ord(c2) + $20);
 | ||
|             end;
 | ||
|             #$A0..#$AF:
 | ||
|             begin
 | ||
|               new_c1 := chr(ord(c1)+1);
 | ||
|               new_c2 := chr(ord(c2) - $20);
 | ||
|             end;
 | ||
|           end;
 | ||
|         end;
 | ||
|         // Archaic and non-slavic cyrillic 460-47F = D1A0-D1BF
 | ||
|         // These require just adding 1 to get the lowercase
 | ||
|         #$D1:
 | ||
|         begin
 | ||
|           if (c2 in [#$A0..#$BF]) and (ord(c2) mod 2 = 0) then
 | ||
|             new_c2 := chr(ord(c2) + 1);
 | ||
|         end;
 | ||
|         // Archaic and non-slavic cyrillic 480-4BF = D280-D2BF
 | ||
|         // These mostly require just adding 1 to get the lowercase
 | ||
|         #$D2:
 | ||
|         begin
 | ||
|           case c2 of
 | ||
|             #$80:
 | ||
|             begin
 | ||
|               new_c2 := chr(ord(c2) + 1);
 | ||
|             end;
 | ||
|             // #$81 is already lowercase
 | ||
|             // #$82-#$89 ???
 | ||
|             #$8A..#$BF:
 | ||
|             begin
 | ||
|               if ord(c2) mod 2 = 0 then
 | ||
|                 new_c2 := chr(ord(c2) + 1);
 | ||
|             end;
 | ||
|           end;
 | ||
|         end;
 | ||
|         {
 | ||
|         Codepoints  04C0..04FF
 | ||
|         }
 | ||
|         #$D3:
 | ||
|         begin
 | ||
|           case c2 of
 | ||
|             #$80: new_c2 := #$8F;
 | ||
|             #$81..#$8E:
 | ||
|             begin
 | ||
|               if ord(c2) mod 2 = 1 then
 | ||
|                 new_c2 := chr(ord(c2) + 1);
 | ||
|             end;
 | ||
|             #$90..#$BF:
 | ||
|             begin
 | ||
|               if ord(c2) mod 2 = 0 then
 | ||
|                 new_c2 := chr(ord(c2) + 1);
 | ||
|             end;
 | ||
|           end;
 | ||
|         end;
 | ||
|         {
 | ||
|         Codepoints  0500..053F
 | ||
| 
 | ||
|         Armenian starts in 0531
 | ||
|         }
 | ||
|         #$D4:
 | ||
|         begin
 | ||
|           if ord(c2) mod 2 = 0 then
 | ||
|             new_c2 := chr(ord(c2) + 1);
 | ||
| 
 | ||
|           // Armenian
 | ||
|           if c2 in [#$B1..#$BF] then
 | ||
|           begin
 | ||
|             new_c1 := #$D5;
 | ||
|             new_c2 := chr(ord(c2) - $10);
 | ||
|           end;
 | ||
|         end;
 | ||
|         {
 | ||
|         Codepoints  0540..057F
 | ||
| 
 | ||
|         Armenian
 | ||
|         }
 | ||
|         #$D5:
 | ||
|         begin
 | ||
|           case c2 of
 | ||
|             #$80..#$8F:
 | ||
|             begin
 | ||
|               new_c2 := chr(ord(c2) + $30);
 | ||
|             end;
 | ||
|             #$90..#$96:
 | ||
|             begin
 | ||
|               new_c1 := #$D6;
 | ||
|               new_c2 := chr(ord(c2) - $10);
 | ||
|             end;
 | ||
|           end;
 | ||
|         end;
 | ||
|         end;
 | ||
|         // Common code 2-byte modifiable chars
 | ||
|         if (CounterDiff <> 0) then
 | ||
|         begin
 | ||
|           OutStr^ := new_c1;
 | ||
|           OutStr[1] := new_c2;
 | ||
|         end
 | ||
|         else
 | ||
|         begin
 | ||
|           if (new_c1 <> c1) then OutStr^ := new_c1;
 | ||
|           if (new_c2 <> c2) then OutStr[1] := new_c2;
 | ||
|         end;
 | ||
|         inc(InStr, 2);
 | ||
|         inc(OutStr, 2);
 | ||
|       end;
 | ||
|       {
 | ||
|       Characters with 3 bytes
 | ||
|       }
 | ||
|       #$E1:
 | ||
|       begin
 | ||
|         new_c1 := c1;
 | ||
|         c2 := InStr[1];
 | ||
|         c3 := InStr[2];
 | ||
|         new_c2 := c2;
 | ||
|         new_c3 := c3;
 | ||
|         {
 | ||
|         Georgian codepoints 10A0-10C5 => 2D00-2D25
 | ||
| 
 | ||
|         In UTF-8 this is:
 | ||
|         E1 82 A0 - E1 82 BF => E2 B4 80 - E2 B4 9F
 | ||
|         E1 83 80 - E1 83 85 => E2 B4 A0 - E2 B4 A5
 | ||
|         }
 | ||
|         case c2 of
 | ||
|         #$82:
 | ||
|         if (c3 in [#$A0..#$BF]) then
 | ||
|         begin
 | ||
|           new_c1 := #$E2;
 | ||
|           new_c2 := #$B4;
 | ||
|           new_c3 := chr(ord(c3) - $20);
 | ||
|         end;
 | ||
|         #$83:
 | ||
|         if (c3 in [#$80..#$85]) then
 | ||
|         begin
 | ||
|           new_c1 := #$E2;
 | ||
|           new_c2 := #$B4;
 | ||
|           new_c3 := chr(ord(c3) + $20);
 | ||
|         end;
 | ||
|         {
 | ||
|         Extra chars between 1E00..1EFF
 | ||
| 
 | ||
|         Blocks of chars:
 | ||
|           1E00..1E3F    E1 B8 80..E1 B8 BF
 | ||
|           1E40..1E7F    E1 B9 80..E1 B9 BF
 | ||
|           1E80..1EBF    E1 BA 80..E1 BA BF
 | ||
|           1EC0..1EFF    E1 BB 80..E1 BB BF
 | ||
|         }
 | ||
|         #$B8..#$BB:
 | ||
|         begin
 | ||
|           // Start with a default and change for some particular chars
 | ||
|           if ord(c3) mod 2 = 0 then
 | ||
|             new_c3 := chr(ord(c3) + 1);
 | ||
| 
 | ||
|           { Only 1E96..1E9F are different E1 BA 96..E1 BA 9F
 | ||
| 
 | ||
|           1E96;LATIN SMALL LETTER H WITH LINE BELOW;Ll;0;L;0068 0331;;;;N;;;;;
 | ||
|           1E97;LATIN SMALL LETTER T WITH DIAERESIS;Ll;0;L;0074 0308;;;;N;;;;;
 | ||
|           1E98;LATIN SMALL LETTER W WITH RING ABOVE;Ll;0;L;0077 030A;;;;N;;;;;
 | ||
|           1E99;LATIN SMALL LETTER Y WITH RING ABOVE;Ll;0;L;0079 030A;;;;N;;;;;
 | ||
|           1E9A;LATIN SMALL LETTER A WITH RIGHT HALF RING;Ll;0;L;<compat> 0061 02BE;;;;N;;;;;
 | ||
|           1E9B;LATIN SMALL LETTER LONG S WITH DOT ABOVE;Ll;0;L;017F 0307;;;;N;;;1E60;;1E60
 | ||
|           1E9C;LATIN SMALL LETTER LONG S WITH DIAGONAL STROKE;Ll;0;L;;;;;N;;;;;
 | ||
|           1E9D;LATIN SMALL LETTER LONG S WITH HIGH STROKE;Ll;0;L;;;;;N;;;;;
 | ||
|           1E9E;LATIN CAPITAL LETTER SHARP S;Lu;0;L;;;;;N;;;;00DF; => C3 9F
 | ||
|           1E9F;LATIN SMALL LETTER DELTA;Ll;0;L;;;;;N;;;;;
 | ||
|           }
 | ||
|           if (c2 = #$BA) and (c3 in [#$96..#$9F]) then new_c3 := c3;
 | ||
|           // LATIN CAPITAL LETTER SHARP S => to german Beta
 | ||
|           if (c2 = #$BA) and (c3 = #$9E) then
 | ||
|           begin
 | ||
|             inc(InStr, 3);
 | ||
|             OutStr^ := #$C3;
 | ||
|             inc(OutStr);
 | ||
|             OutStr^ := #$9F;
 | ||
|             inc(OutStr);
 | ||
|             inc(CounterDiff, 1);
 | ||
|             Continue;
 | ||
|           end;
 | ||
|         end;
 | ||
|         {
 | ||
|         Extra chars between 1F00..1FFF
 | ||
| 
 | ||
|         Blocks of chars:
 | ||
|           1E00..1E3F    E1 BC 80..E1 BC BF
 | ||
|           1E40..1E7F    E1 BD 80..E1 BD BF
 | ||
|           1E80..1EBF    E1 BE 80..E1 BE BF
 | ||
|           1EC0..1EFF    E1 BF 80..E1 BF BF
 | ||
|         }
 | ||
|         #$BC:
 | ||
|         begin
 | ||
|           // Start with a default and change for some particular chars
 | ||
|           if (ord(c3) mod $10) div 8 = 1 then
 | ||
|             new_c3 := chr(ord(c3) - 8);
 | ||
|         end;
 | ||
|         #$BD:
 | ||
|         begin
 | ||
|           // Start with a default and change for some particular chars
 | ||
|           case c3 of
 | ||
|           #$80..#$8F, #$A0..#$AF: if (ord(c3) mod $10) div 8 = 1 then
 | ||
|                         new_c3 := chr(ord(c3) - 8);
 | ||
|           {
 | ||
|           1F50;GREEK SMALL LETTER UPSILON WITH PSILI;Ll;0;L;03C5 0313;;;;N;;;;;
 | ||
|           1F51;GREEK SMALL LETTER UPSILON WITH DASIA;Ll;0;L;03C5 0314;;;;N;;;1F59;;1F59
 | ||
|           1F52;GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA;Ll;0;L;1F50 0300;;;;N;;;;;
 | ||
|           1F53;GREEK SMALL LETTER UPSILON WITH DASIA AND VARIA;Ll;0;L;1F51 0300;;;;N;;;1F5B;;1F5B
 | ||
|           1F54;GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA;Ll;0;L;1F50 0301;;;;N;;;;;
 | ||
|           1F55;GREEK SMALL LETTER UPSILON WITH DASIA AND OXIA;Ll;0;L;1F51 0301;;;;N;;;1F5D;;1F5D
 | ||
|           1F56;GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI;Ll;0;L;1F50 0342;;;;N;;;;;
 | ||
|           1F57;GREEK SMALL LETTER UPSILON WITH DASIA AND PERISPOMENI;Ll;0;L;1F51 0342;;;;N;;;1F5F;;1F5F
 | ||
|           1F59;GREEK CAPITAL LETTER UPSILON WITH DASIA;Lu;0;L;03A5 0314;;;;N;;;;1F51;
 | ||
|           1F5B;GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA;Lu;0;L;1F59 0300;;;;N;;;;1F53;
 | ||
|           1F5D;GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA;Lu;0;L;1F59 0301;;;;N;;;;1F55;
 | ||
|           1F5F;GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI;Lu;0;L;1F59 0342;;;;N;;;;1F57;
 | ||
|           }
 | ||
|           #$99,#$9B,#$9D,#$9F: new_c3 := chr(ord(c3) - 8);
 | ||
|           end;
 | ||
|         end;
 | ||
|         #$BE:
 | ||
|         begin
 | ||
|           // Start with a default and change for some particular chars
 | ||
|           case c3 of
 | ||
|           #$80..#$B9: if (ord(c3) mod $10) div 8 = 1 then
 | ||
|                         new_c3 := chr(ord(c3) - 8);
 | ||
|           {
 | ||
|           1FB0;GREEK SMALL LETTER ALPHA WITH VRACHY;Ll;0;L;03B1 0306;;;;N;;;1FB8;;1FB8
 | ||
|           1FB1;GREEK SMALL LETTER ALPHA WITH MACRON;Ll;0;L;03B1 0304;;;;N;;;1FB9;;1FB9
 | ||
|           1FB2;GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI;Ll;0;L;1F70 0345;;;;N;;;;;
 | ||
|           1FB3;GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI;Ll;0;L;03B1 0345;;;;N;;;1FBC;;1FBC
 | ||
|           1FB4;GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI;Ll;0;L;03AC 0345;;;;N;;;;;
 | ||
|           1FB6;GREEK SMALL LETTER ALPHA WITH PERISPOMENI;Ll;0;L;03B1 0342;;;;N;;;;;
 | ||
|           1FB7;GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI;Ll;0;L;1FB6 0345;;;;N;;;;;
 | ||
|           1FB8;GREEK CAPITAL LETTER ALPHA WITH VRACHY;Lu;0;L;0391 0306;;;;N;;;;1FB0;
 | ||
|           1FB9;GREEK CAPITAL LETTER ALPHA WITH MACRON;Lu;0;L;0391 0304;;;;N;;;;1FB1;
 | ||
|           1FBA;GREEK CAPITAL LETTER ALPHA WITH VARIA;Lu;0;L;0391 0300;;;;N;;;;1F70;
 | ||
|           1FBB;GREEK CAPITAL LETTER ALPHA WITH OXIA;Lu;0;L;0386;;;;N;;;;1F71;
 | ||
|           1FBC;GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI;Lt;0;L;0391 0345;;;;N;;;;1FB3;
 | ||
|           1FBD;GREEK KORONIS;Sk;0;ON;<compat> 0020 0313;;;;N;;;;;
 | ||
|           1FBE;GREEK PROSGEGRAMMENI;Ll;0;L;03B9;;;;N;;;0399;;0399
 | ||
|           1FBF;GREEK PSILI;Sk;0;ON;<compat> 0020 0313;;;;N;;;;;
 | ||
|           }
 | ||
|           #$BA:
 | ||
|           begin
 | ||
|             new_c2 := #$BD;
 | ||
|             new_c3 := #$B0;
 | ||
|           end;
 | ||
|           #$BB:
 | ||
|           begin
 | ||
|             new_c2 := #$BD;
 | ||
|             new_c3 := #$B1;
 | ||
|           end;
 | ||
|           #$BC: new_c3 := #$B3;
 | ||
|           end;
 | ||
|         end;
 | ||
|         end;
 | ||
| 
 | ||
|         if (CounterDiff <> 0) then
 | ||
|         begin
 | ||
|           OutStr^ := new_c1;
 | ||
|           OutStr[1] := new_c2;
 | ||
|           OutStr[2] := new_c3;
 | ||
|         end
 | ||
|         else
 | ||
|         begin
 | ||
|           if c1 <> new_c1 then OutStr^ := new_c1;
 | ||
|           if c2 <> new_c2 then OutStr[1] := new_c2;
 | ||
|           if c3 <> new_c3 then OutStr[2] := new_c3;
 | ||
|         end;
 | ||
| 
 | ||
|         inc(InStr, 3);
 | ||
|         inc(OutStr, 3);
 | ||
|       end;
 | ||
|       {
 | ||
|       More Characters with 3 bytes, so exotic stuff between:
 | ||
|       $2126..$2183                    E2 84 A6..E2 86 83
 | ||
|       $24B6..$24CF    Result:=u+26;   E2 92 B6..E2 93 8F
 | ||
|       $2C00..$2C2E    Result:=u+48;   E2 B0 80..E2 B0 AE
 | ||
|       $2C60..$2CE2                    E2 B1 A0..E2 B3 A2
 | ||
|       }
 | ||
|       #$E2:
 | ||
|       begin
 | ||
|         new_c1 := c1;
 | ||
|         c2 := InStr[1];
 | ||
|         c3 := InStr[2];
 | ||
|         new_c2 := c2;
 | ||
|         new_c3 := c3;
 | ||
|         // 2126;OHM SIGN;Lu;0;L;03A9;;;;N;OHM;;;03C9; E2 84 A6 => CF 89
 | ||
|         if (c2 = #$84) and (c3 = #$A6) then
 | ||
|         begin
 | ||
|           inc(InStr, 3);
 | ||
|           OutStr^ := #$CF;
 | ||
|           inc(OutStr);
 | ||
|           OutStr^ := #$89;
 | ||
|           inc(OutStr);
 | ||
|           inc(CounterDiff, 1);
 | ||
|           Continue;
 | ||
|         end
 | ||
|         {
 | ||
|         212A;KELVIN SIGN;Lu;0;L;004B;;;;N;DEGREES KELVIN;;;006B; E2 84 AA => 6B
 | ||
|         }
 | ||
|         else if (c2 = #$84) and (c3 = #$AA) then
 | ||
|         begin
 | ||
|           inc(InStr, 3);
 | ||
|           if c3 = #$AA then OutStr^ := #$6B
 | ||
|           else OutStr^ := #$E5;
 | ||
|           inc(OutStr);
 | ||
|           inc(CounterDiff, 2);
 | ||
|           Continue;
 | ||
|         end
 | ||
|         {
 | ||
|         212B;ANGSTROM SIGN;Lu;0;L;00C5;;;;N;ANGSTROM UNIT;;;00E5; E2 84 AB => C3 A5
 | ||
|         }
 | ||
|         else if (c2 = #$84) and (c3 = #$AB) then
 | ||
|         begin
 | ||
|           inc(InStr, 3);
 | ||
|           OutStr^ := #$C3;
 | ||
|           inc(OutStr);
 | ||
|           OutStr^ := #$A5;
 | ||
|           inc(OutStr);
 | ||
|           inc(CounterDiff, 1);
 | ||
|           Continue;
 | ||
|         end
 | ||
|         {
 | ||
|         2160;ROMAN NUMERAL ONE;Nl;0;L;<compat> 0049;;;1;N;;;;2170; E2 85 A0 => E2 85 B0
 | ||
|         2161;ROMAN NUMERAL TWO;Nl;0;L;<compat> 0049 0049;;;2;N;;;;2171;
 | ||
|         2162;ROMAN NUMERAL THREE;Nl;0;L;<compat> 0049 0049 0049;;;3;N;;;;2172;
 | ||
|         2163;ROMAN NUMERAL FOUR;Nl;0;L;<compat> 0049 0056;;;4;N;;;;2173;
 | ||
|         2164;ROMAN NUMERAL FIVE;Nl;0;L;<compat> 0056;;;5;N;;;;2174;
 | ||
|         2165;ROMAN NUMERAL SIX;Nl;0;L;<compat> 0056 0049;;;6;N;;;;2175;
 | ||
|         2166;ROMAN NUMERAL SEVEN;Nl;0;L;<compat> 0056 0049 0049;;;7;N;;;;2176;
 | ||
|         2167;ROMAN NUMERAL EIGHT;Nl;0;L;<compat> 0056 0049 0049 0049;;;8;N;;;;2177;
 | ||
|         2168;ROMAN NUMERAL NINE;Nl;0;L;<compat> 0049 0058;;;9;N;;;;2178;
 | ||
|         2169;ROMAN NUMERAL TEN;Nl;0;L;<compat> 0058;;;10;N;;;;2179;
 | ||
|         216A;ROMAN NUMERAL ELEVEN;Nl;0;L;<compat> 0058 0049;;;11;N;;;;217A;
 | ||
|         216B;ROMAN NUMERAL TWELVE;Nl;0;L;<compat> 0058 0049 0049;;;12;N;;;;217B;
 | ||
|         216C;ROMAN NUMERAL FIFTY;Nl;0;L;<compat> 004C;;;50;N;;;;217C;
 | ||
|         216D;ROMAN NUMERAL ONE HUNDRED;Nl;0;L;<compat> 0043;;;100;N;;;;217D;
 | ||
|         216E;ROMAN NUMERAL FIVE HUNDRED;Nl;0;L;<compat> 0044;;;500;N;;;;217E;
 | ||
|         216F;ROMAN NUMERAL ONE THOUSAND;Nl;0;L;<compat> 004D;;;1000;N;;;;217F;
 | ||
|         }
 | ||
|         else if (c2 = #$85) and (c3 in [#$A0..#$AF]) then new_c3 := chr(ord(c3) + $10)
 | ||
|         {
 | ||
|         2183;ROMAN NUMERAL REVERSED ONE HUNDRED;Lu;0;L;;;;;N;;;;2184; E2 86 83 => E2 86 84
 | ||
|         }
 | ||
|         else if (c2 = #$86) and (c3 = #$83) then new_c3 := chr(ord(c3) + 1)
 | ||
|         {
 | ||
|         $24B6..$24CF    Result:=u+26;   E2 92 B6..E2 93 8F
 | ||
| 
 | ||
|         Ex: 24B6;CIRCLED LATIN CAPITAL LETTER A;So;0;L;<circle> 0041;;;;N;;;;24D0; E2 92 B6 => E2 93 90
 | ||
|         }
 | ||
|         else if (c2 = #$92) and (c3 in [#$B6..#$BF]) then
 | ||
|         begin
 | ||
|           new_c3 := #$93;
 | ||
|           new_c3 := chr(ord(c3) - $26);
 | ||
|         end
 | ||
|         else if (c2 = #$93) and (c3 in [#$80..#$8F]) then new_c3 := chr(ord(c3) + 26)
 | ||
|         {
 | ||
|         $2C00..$2C2E    Result:=u+48;   E2 B0 80..E2 B0 AE
 | ||
| 
 | ||
|         2C00;GLAGOLITIC CAPITAL LETTER AZU;Lu;0;L;;;;;N;;;;2C30; E2 B0 80 => E2 B0 B0
 | ||
| 
 | ||
|         2C10;GLAGOLITIC CAPITAL LETTER NASHI;Lu;0;L;;;;;N;;;;2C40; E2 B0 90 => E2 B1 80
 | ||
|         }
 | ||
|         else if (c2 = #$B0) and (c3 in [#$80..#$8F]) then new_c3 := chr(ord(c3) + $30)
 | ||
|         else if (c2 = #$B0) and (c3 in [#$90..#$AE]) then
 | ||
|         begin
 | ||
|           new_c2 := #$B1;
 | ||
|           new_c3 := chr(ord(c3) - $10);
 | ||
|         end
 | ||
|         {
 | ||
|         $2C60..$2CE2                    E2 B1 A0..E2 B3 A2
 | ||
| 
 | ||
|         2C60;LATIN CAPITAL LETTER L WITH DOUBLE BAR;Lu;0;L;;;;;N;;;;2C61; E2 B1 A0 => +1
 | ||
|         2C61;LATIN SMALL LETTER L WITH DOUBLE BAR;Ll;0;L;;;;;N;;;2C60;;2C60
 | ||
|         2C62;LATIN CAPITAL LETTER L WITH MIDDLE TILDE;Lu;0;L;;;;;N;;;;026B; => 	C9 AB
 | ||
|         2C63;LATIN CAPITAL LETTER P WITH STROKE;Lu;0;L;;;;;N;;;;1D7D; => E1 B5 BD
 | ||
|         2C64;LATIN CAPITAL LETTER R WITH TAIL;Lu;0;L;;;;;N;;;;027D; => 	C9 BD
 | ||
|         2C65;LATIN SMALL LETTER A WITH STROKE;Ll;0;L;;;;;N;;;023A;;023A
 | ||
|         2C66;LATIN SMALL LETTER T WITH DIAGONAL STROKE;Ll;0;L;;;;;N;;;023E;;023E
 | ||
|         2C67;LATIN CAPITAL LETTER H WITH DESCENDER;Lu;0;L;;;;;N;;;;2C68; => E2 B1 A8
 | ||
|         2C68;LATIN SMALL LETTER H WITH DESCENDER;Ll;0;L;;;;;N;;;2C67;;2C67
 | ||
|         2C69;LATIN CAPITAL LETTER K WITH DESCENDER;Lu;0;L;;;;;N;;;;2C6A; => E2 B1 AA
 | ||
|         2C6A;LATIN SMALL LETTER K WITH DESCENDER;Ll;0;L;;;;;N;;;2C69;;2C69
 | ||
|         2C6B;LATIN CAPITAL LETTER Z WITH DESCENDER;Lu;0;L;;;;;N;;;;2C6C; => E2 B1 AC
 | ||
|         2C6C;LATIN SMALL LETTER Z WITH DESCENDER;Ll;0;L;;;;;N;;;2C6B;;2C6B
 | ||
|         2C6D;LATIN CAPITAL LETTER ALPHA;Lu;0;L;;;;;N;;;;0251; => C9 91
 | ||
|         2C6E;LATIN CAPITAL LETTER M WITH HOOK;Lu;0;L;;;;;N;;;;0271; => C9 B1
 | ||
|         2C6F;LATIN CAPITAL LETTER TURNED A;Lu;0;L;;;;;N;;;;0250; => C9 90
 | ||
| 
 | ||
|         2C70;LATIN CAPITAL LETTER TURNED ALPHA;Lu;0;L;;;;;N;;;;0252; => C9 92
 | ||
|         }
 | ||
|         else if (c2 = #$B1) then
 | ||
|         begin
 | ||
|           case c3 of
 | ||
|           #$A0: new_c3 := chr(ord(c3)+1);
 | ||
|           #$A2,#$A4,#$AD..#$AF,#$B0:
 | ||
|           begin
 | ||
|             inc(InStr, 3);
 | ||
|             OutStr^ := #$C9;
 | ||
|             inc(OutStr);
 | ||
|             case c3 of
 | ||
|             #$A2: OutStr^ := #$AB;
 | ||
|             #$A4: OutStr^ := #$BD;
 | ||
|             #$AD: OutStr^ := #$90;
 | ||
|             #$AE: OutStr^ := #$B1;
 | ||
|             #$AF: OutStr^ := #$90;
 | ||
|             #$B0: OutStr^ := #$92;
 | ||
|             end;
 | ||
|             inc(OutStr);
 | ||
|             inc(CounterDiff, 1);
 | ||
|             Continue;
 | ||
|           end;
 | ||
|           #$A3:
 | ||
|           begin
 | ||
|             new_c2 := #$B5;
 | ||
|             new_c3 := #$BD;
 | ||
|           end;
 | ||
|           #$A7,#$A9,#$AB: new_c3 := chr(ord(c3)+1);
 | ||
|           {
 | ||
|           2C71;LATIN SMALL LETTER V WITH RIGHT HOOK;Ll;0;L;;;;;N;;;;;
 | ||
|           2C72;LATIN CAPITAL LETTER W WITH HOOK;Lu;0;L;;;;;N;;;;2C73;
 | ||
|           2C73;LATIN SMALL LETTER W WITH HOOK;Ll;0;L;;;;;N;;;2C72;;2C72
 | ||
|           2C74;LATIN SMALL LETTER V WITH CURL;Ll;0;L;;;;;N;;;;;
 | ||
|           2C75;LATIN CAPITAL LETTER HALF H;Lu;0;L;;;;;N;;;;2C76;
 | ||
|           2C76;LATIN SMALL LETTER HALF H;Ll;0;L;;;;;N;;;2C75;;2C75
 | ||
|           2C77;LATIN SMALL LETTER TAILLESS PHI;Ll;0;L;;;;;N;;;;;
 | ||
|           2C78;LATIN SMALL LETTER E WITH NOTCH;Ll;0;L;;;;;N;;;;;
 | ||
|           2C79;LATIN SMALL LETTER TURNED R WITH TAIL;Ll;0;L;;;;;N;;;;;
 | ||
|           2C7A;LATIN SMALL LETTER O WITH LOW RING INSIDE;Ll;0;L;;;;;N;;;;;
 | ||
|           2C7B;LATIN LETTER SMALL CAPITAL TURNED E;Ll;0;L;;;;;N;;;;;
 | ||
|           2C7C;LATIN SUBSCRIPT SMALL LETTER J;Ll;0;L;<sub> 006A;;;;N;;;;;
 | ||
|           2C7D;MODIFIER LETTER CAPITAL V;Lm;0;L;<super> 0056;;;;N;;;;;
 | ||
|           2C7E;LATIN CAPITAL LETTER S WITH SWASH TAIL;Lu;0;L;;;;;N;;;;023F; => C8 BF
 | ||
|           2C7F;LATIN CAPITAL LETTER Z WITH SWASH TAIL;Lu;0;L;;;;;N;;;;0240; => C9 80
 | ||
|           }
 | ||
|           #$B2,#$B5: new_c3 := chr(ord(c3)+1);
 | ||
|           #$BE,#$BF:
 | ||
|           begin
 | ||
|             inc(InStr, 3);
 | ||
|             case c3 of
 | ||
|             #$BE: OutStr^ := #$C8;
 | ||
|             #$BF: OutStr^ := #$C9;
 | ||
|             end;
 | ||
|             OutStr^ := #$C8;
 | ||
|             inc(OutStr);
 | ||
|             case c3 of
 | ||
|             #$BE: OutStr^ := #$BF;
 | ||
|             #$BF: OutStr^ := #$80;
 | ||
|             end;
 | ||
|             inc(OutStr);
 | ||
|             inc(CounterDiff, 1);
 | ||
|             Continue;
 | ||
|           end;
 | ||
|           end;
 | ||
|         end
 | ||
|         {
 | ||
|         2C80;COPTIC CAPITAL LETTER ALFA;Lu;0;L;;;;;N;;;;2C81; E2 B2 80 => E2 B2 81
 | ||
|         ...
 | ||
|         2CBE;COPTIC CAPITAL LETTER OLD COPTIC OOU;Lu;0;L;;;;;N;;;;2CBF; E2 B2 BE => E2 B2 BF
 | ||
|         2CBF;COPTIC SMALL LETTER OLD COPTIC OOU;Ll;0;L;;;;;N;;;2CBE;;2CBE
 | ||
|         ...
 | ||
|         2CC0;COPTIC CAPITAL LETTER SAMPI;Lu;0;L;;;;;N;;;;2CC1; E2 B3 80 => E2 B2 81
 | ||
|         2CC1;COPTIC SMALL LETTER SAMPI;Ll;0;L;;;;;N;;;2CC0;;2CC0
 | ||
|         ...
 | ||
|         2CE2;COPTIC CAPITAL LETTER OLD NUBIAN WAU;Lu;0;L;;;;;N;;;;2CE3; E2 B3 A2 => E2 B3 A3
 | ||
|         2CE3;COPTIC SMALL LETTER OLD NUBIAN WAU;Ll;0;L;;;;;N;;;2CE2;;2CE2 <=
 | ||
|         }
 | ||
|         else if (c2 = #$B2) then
 | ||
|         begin
 | ||
|           if ord(c3) mod 2 = 0 then new_c3 := chr(ord(c3) + 1);
 | ||
|         end
 | ||
|         else if (c2 = #$B3) and (c3 in [#$80..#$A3]) then
 | ||
|         begin
 | ||
|           if ord(c3) mod 2 = 0 then new_c3 := chr(ord(c3) + 1);
 | ||
|         end;
 | ||
| 
 | ||
|         if (CounterDiff <> 0) then
 | ||
|         begin
 | ||
|           OutStr^ := new_c1;
 | ||
|           OutStr[1] := new_c2;
 | ||
|           OutStr[2] := new_c3;
 | ||
|         end
 | ||
|         else
 | ||
|         begin
 | ||
|           if c1 <> new_c1 then OutStr^ := new_c1;
 | ||
|           if c2 <> new_c2 then OutStr[1] := new_c2;
 | ||
|           if c3 <> new_c3 then OutStr[2] := new_c3;
 | ||
|         end;
 | ||
| 
 | ||
|         inc(InStr, 3);
 | ||
|         inc(OutStr, 3);
 | ||
|       end;
 | ||
|       {
 | ||
|       FF21;FULLWIDTH LATIN CAPITAL LETTER A;Lu;0;L;<wide> 0041;;;;N;;;;FF41; EF BC A1 => EF BD 81
 | ||
|       ...
 | ||
|       FF3A;FULLWIDTH LATIN CAPITAL LETTER Z;Lu;0;L;<wide> 005A;;;;N;;;;FF5A; EF BC BA => EF BD 9A
 | ||
|       }
 | ||
|       #$EF:
 | ||
|       begin
 | ||
|         c2 := InStr[1];
 | ||
|         c3 := InStr[2];
 | ||
| 
 | ||
|         if (c2 = #$BC) and (c3 in [#$A1..#$BA]) then
 | ||
|         begin
 | ||
|           OutStr^ := c1;
 | ||
|           OutStr[1] := #$BD;
 | ||
|           OutStr[2] := chr(ord(c3) - $20);
 | ||
|         end;
 | ||
| 
 | ||
|         if (CounterDiff <> 0) then
 | ||
|         begin
 | ||
|           OutStr^ := c1;
 | ||
|           OutStr[1] := c2;
 | ||
|           OutStr[2] := c3;
 | ||
|         end;
 | ||
| 
 | ||
|         inc(InStr, 3);
 | ||
|         inc(OutStr, 3);
 | ||
|       end;
 | ||
|     else
 | ||
|       // Copy the character if the string was disaligned by previous changes
 | ||
|       if (CounterDiff <> 0) then OutStr^:= c1;
 | ||
|       inc(InStr);
 | ||
|       inc(OutStr);
 | ||
|     end; // Case InStr^
 | ||
|   end; // while
 | ||
| 
 | ||
|   // Final correction of the buffer size
 | ||
|   SetLength(Result,OutStr - PChar(Result));
 | ||
| end;
 | ||
| 
 | ||
| {
 | ||
|   AInStr - The input string
 | ||
|   ALanguage - The language. Use '' for maximum speed if one desires to ignore the language
 | ||
|               The language should be specified in the format from ISO 639-1,
 | ||
|               which uses 2 characters to represent each language.
 | ||
|               If the language has no code in ISO 639-1, then the 3-chars code
 | ||
|               from ISO 639-2 should be used.
 | ||
|               Example: "tr" - Turkish language locale
 | ||
| 
 | ||
|   Data from here: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
 | ||
| 
 | ||
|   The columns in the file UnicodeData.txt are explained here:
 | ||
|   http://www.ksu.ru/eng/departments/ktk/test/perl/lib/unicode/UCDFF301.html#Case Mappings
 | ||
| }
 | ||
| function UTF8UpperCase(const AInStr: string; ALanguage: string=''): string;
 | ||
| var
 | ||
|   i, InCounter, OutCounter: PtrInt;
 | ||
|   OutStr: PChar;
 | ||
|   CharLen: integer;
 | ||
|   CharProcessed: Boolean;
 | ||
|   NewCharLen: integer;
 | ||
|   NewChar, OldChar: Word;
 | ||
|   // Language identification
 | ||
|   IsTurkish: Boolean;
 | ||
| 
 | ||
|   procedure CorrectOutStrSize(AOldCharSize, ANewCharSize: Integer);
 | ||
|   begin
 | ||
|     if not (ANewCharSize > AOldCharSize) then Exit; // no correction needed
 | ||
|     if (ANewCharSize > 20) or (AOldCharSize > 20) then Exit; // sanity check
 | ||
|     // Fix for bug 23428
 | ||
|     // If the string wasn't decreased by previous char changes,
 | ||
|     // and our current operation will make it bigger, then for safety
 | ||
|     // increase the buffer
 | ||
|     if (ANewCharSize > AOldCharSize) and (OutCounter >= InCounter-1) then
 | ||
|     begin
 | ||
|       SetLength(Result, Length(Result)+ANewCharSize-AOldCharSize);
 | ||
|       OutStr := PChar(Result);
 | ||
|     end;
 | ||
|   end;
 | ||
| 
 | ||
| begin
 | ||
|   // Start with the same string, and progressively modify
 | ||
|   Result:=AInStr;
 | ||
|   UniqueString(Result);
 | ||
|   OutStr := PChar(Result);
 | ||
| 
 | ||
|   // Language identification
 | ||
|   IsTurkish := (ALanguage = 'tr') or (ALanguage = 'az'); // Turkish and Azeri have a special handling
 | ||
| 
 | ||
|   InCounter:=1; // for AInStr
 | ||
|   OutCounter := 0; // for Result
 | ||
|   while InCounter<=length(AInStr) do
 | ||
|   begin
 | ||
|     { First ASCII chars }
 | ||
|     if (AInStr[InCounter] <= 'z') and (AInStr[InCounter] >= 'a') then
 | ||
|     begin
 | ||
|       // Special turkish handling
 | ||
|       // small dotted i to capital dotted i
 | ||
|       if IsTurkish and (AInStr[InCounter] = 'i') then
 | ||
|       begin
 | ||
|         SetLength(Result,Length(Result)+1);// Increase the buffer
 | ||
|         OutStr := PChar(Result);
 | ||
|         OutStr[OutCounter]:=#$C4;
 | ||
|         OutStr[OutCounter+1]:=#$B0;
 | ||
|         inc(InCounter);
 | ||
|         inc(OutCounter,2);
 | ||
|       end
 | ||
|       else
 | ||
|       begin
 | ||
|         OutStr[OutCounter]:=chr(ord(AInStr[InCounter])-32);
 | ||
|         inc(InCounter);
 | ||
|         inc(OutCounter);
 | ||
|       end;
 | ||
|     end
 | ||
|     { Now everything else }
 | ||
|     else
 | ||
|     begin
 | ||
|       CharLen := UTF8CharacterLength(@AInStr[InCounter]);
 | ||
|       CharProcessed := False;
 | ||
|       NewCharLen := CharLen;
 | ||
| 
 | ||
|       if CharLen = 2 then
 | ||
|       begin
 | ||
|         OldChar := (Ord(AInStr[InCounter]) shl 8) or Ord(AInStr[InCounter+1]);
 | ||
|         NewChar := 0;
 | ||
| 
 | ||
|         // Major processing
 | ||
|         case OldChar of
 | ||
|         // Latin Characters 0000–0FFF http://en.wikibooks.org/wiki/Unicode/Character_reference/0000-0FFF
 | ||
|         $C39F:        NewChar := $5353; // ß => SS
 | ||
|         $C3A0..$C3B6,$C3B8..$C3BE: NewChar := OldChar - $20;
 | ||
|         $C3BF:        NewChar := $C5B8; // ÿ
 | ||
|         $C481..$C4B0: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
 | ||
|         // 0130 = C4 B0
 | ||
|         // turkish small undotted i to capital undotted i
 | ||
|         $C4B1:
 | ||
|         begin
 | ||
|           OutStr[OutCounter]:='I';
 | ||
|           NewCharLen := 1;
 | ||
|           CharProcessed := True;
 | ||
|         end;
 | ||
|         $C4B2..$C4B7: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
 | ||
|         // $C4B8: ĸ without upper/lower
 | ||
|         $C4B9..$C4BF: if OldChar mod 2 = 0 then NewChar := OldChar - 1;
 | ||
|         $C580: NewChar := $C4BF; // border between bytes
 | ||
|         $C581..$C588: if OldChar mod 2 = 0 then NewChar := OldChar - 1;
 | ||
|         // $C589 ʼn => ?
 | ||
|         $C58A..$C5B7: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
 | ||
|         // $C5B8: // Ÿ already uppercase
 | ||
|         $C5B9..$C5BE: if OldChar mod 2 = 0 then NewChar := OldChar - 1;
 | ||
|         $C5BF: // 017F
 | ||
|         begin
 | ||
|           OutStr[OutCounter]:='S';
 | ||
|           NewCharLen := 1;
 | ||
|           CharProcessed := True;
 | ||
|         end;
 | ||
|         // 0180 = C6 80 -> A convoluted part
 | ||
|         $C680: NewChar := $C983;
 | ||
|         $C682..$C685: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
 | ||
|         $C688: NewChar := $C687;
 | ||
|         $C68C: NewChar := $C68B;
 | ||
|         // 0190 = C6 90 -> A convoluted part
 | ||
|         $C692: NewChar := $C691;
 | ||
|         $C695: NewChar := $C7B6;
 | ||
|         $C699: NewChar := $C698;
 | ||
|         $C69A: NewChar := $C8BD;
 | ||
|         $C69E: NewChar := $C8A0;
 | ||
|         // 01A0 = C6 A0 -> A convoluted part
 | ||
|         $C6A0..$C6A5: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
 | ||
|         $C6A8: NewChar := $C6A7;
 | ||
|         $C6AD: NewChar := $C6AC;
 | ||
|         // 01B0 = C6 B0
 | ||
|         $C6B0: NewChar := $C6AF;
 | ||
|         $C6B3..$C6B6: if OldChar mod 2 = 0 then NewChar := OldChar - 1;
 | ||
|         $C6B9: NewChar := $C6B8;
 | ||
|         $C6BD: NewChar := $C6BC;
 | ||
|         $C6BF: NewChar := $C7B7;
 | ||
|         // 01C0 = C7 80
 | ||
|         $C784..$C786: NewChar := $C784;
 | ||
|         $C787..$C789: NewChar := $C787;
 | ||
|         $C78A..$C78C: NewChar := $C78A;
 | ||
|         $C78E: NewChar := $C78D;
 | ||
|         // 01D0 = C7 90
 | ||
|         $C790: NewChar := $C78F;
 | ||
|         $C791..$C79C: if OldChar mod 2 = 0 then NewChar := OldChar - 1;
 | ||
|         $C79D: NewChar := $C68E;
 | ||
|         $C79F: NewChar := $C79E;
 | ||
|         // 01E0 = C7 A0
 | ||
|         $C7A0..$C7AF: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
 | ||
|         // 01F0 = C7 B0
 | ||
|         $C7B2..$C7B3: NewChar := $C7B1;
 | ||
|         $C7B5: NewChar := $C7B4;
 | ||
|         $C7B8..$C7BF: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
 | ||
|         // 0200 = C8 80
 | ||
|         // 0210 = C8 90
 | ||
|         $C880..$C89F: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
 | ||
|         // 0220 = C8 A0
 | ||
|         // 0230 = C8 B0
 | ||
|         $C8A2..$C8B3: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
 | ||
|         $C8BC: NewChar := $C8BB;
 | ||
|         $C8BF:
 | ||
|         begin
 | ||
|           CorrectOutStrSize(2, 3);
 | ||
|           OutStr[OutCounter]  := #$E2;
 | ||
|           OutStr[OutCounter+1]:= #$B1;
 | ||
|           OutStr[OutCounter+2]:= #$BE;
 | ||
|           NewCharLen := 3;
 | ||
|           CharProcessed := True;
 | ||
|         end;
 | ||
|         // 0240 = C9 80
 | ||
|         $C980:
 | ||
|         begin
 | ||
|           CorrectOutStrSize(2, 3);
 | ||
|           OutStr[OutCounter]  := #$E2;
 | ||
|           OutStr[OutCounter+1]:= #$B1;
 | ||
|           OutStr[OutCounter+2]:= #$BF;
 | ||
|           NewCharLen := 3;
 | ||
|           CharProcessed := True;
 | ||
|         end;
 | ||
|         $C982: NewChar := $C981;
 | ||
|         $C986..$C98F: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
 | ||
|         // 0250 = C9 90
 | ||
|         $C990:
 | ||
|         begin
 | ||
|           CorrectOutStrSize(2, 3);
 | ||
|           OutStr[OutCounter]  := #$E2;
 | ||
|           OutStr[OutCounter+1]:= #$B1;
 | ||
|           OutStr[OutCounter+2]:= #$AF;
 | ||
|           NewCharLen := 3;
 | ||
|           CharProcessed := True;
 | ||
|         end;
 | ||
|         $C991:
 | ||
|         begin
 | ||
|           CorrectOutStrSize(2, 3);
 | ||
|           OutStr[OutCounter]  := #$E2;
 | ||
|           OutStr[OutCounter+1]:= #$B1;
 | ||
|           OutStr[OutCounter+2]:= #$AD;
 | ||
|           NewCharLen := 3;
 | ||
|           CharProcessed := True;
 | ||
|         end;
 | ||
|         $C992:
 | ||
|         begin
 | ||
|           CorrectOutStrSize(2, 3);
 | ||
|           OutStr[OutCounter]  := #$E2;
 | ||
|           OutStr[OutCounter+1]:= #$B1;
 | ||
|           OutStr[OutCounter+2]:= #$B0;
 | ||
|           NewCharLen := 3;
 | ||
|           CharProcessed := True;
 | ||
|         end;
 | ||
|         $C993: NewChar := $C681;
 | ||
|         $C994: NewChar := $C686;
 | ||
|         $C996: NewChar := $C689;
 | ||
|         $C997: NewChar := $C68A;
 | ||
|         $C999: NewChar := $C68F;
 | ||
|         $C99B: NewChar := $C690;
 | ||
|         // 0260 = C9 A0
 | ||
|         $C9A0: NewChar := $C693;
 | ||
|         $C9A3: NewChar := $C694;
 | ||
|         $C9A5:
 | ||
|         begin
 | ||
|           CorrectOutStrSize(2, 3);
 | ||
|           OutStr[OutCounter]  := #$EA;
 | ||
|           OutStr[OutCounter+1]:= #$9E;
 | ||
|           OutStr[OutCounter+2]:= #$8D;
 | ||
|           NewCharLen := 3;
 | ||
|           CharProcessed := True;
 | ||
|         end;
 | ||
|         $C9A8: NewChar := $C697;
 | ||
|         $C9A9: NewChar := $C696;
 | ||
|         $C9AB:
 | ||
|         begin
 | ||
|           CorrectOutStrSize(2, 3);
 | ||
|           OutStr[OutCounter]  := #$E2;
 | ||
|           OutStr[OutCounter+1]:= #$B1;
 | ||
|           OutStr[OutCounter+2]:= #$A2;
 | ||
|           NewCharLen := 3;
 | ||
|           CharProcessed := True;
 | ||
|         end;
 | ||
|         $C9AF: NewChar := $C69C;
 | ||
|         // 0270 = C9 B0
 | ||
|         $C9B1:
 | ||
|         begin
 | ||
|           CorrectOutStrSize(2, 3);
 | ||
|           OutStr[OutCounter]  := #$E2;
 | ||
|           OutStr[OutCounter+1]:= #$B1;
 | ||
|           OutStr[OutCounter+2]:= #$AE;
 | ||
|           NewCharLen := 3;
 | ||
|           CharProcessed := True;
 | ||
|         end;
 | ||
|         $C9B2: NewChar := $C69D;
 | ||
|         $C9B5: NewChar := $C69F;
 | ||
|         $C9BD:
 | ||
|         begin
 | ||
|           CorrectOutStrSize(2, 3);
 | ||
|           OutStr[OutCounter]  := #$E2;
 | ||
|           OutStr[OutCounter+1]:= #$B1;
 | ||
|           OutStr[OutCounter+2]:= #$A4;
 | ||
|           NewCharLen := 3;
 | ||
|           CharProcessed := True;
 | ||
|         end;
 | ||
|         // 0280 = CA 80
 | ||
|         $CA80: NewChar := $C6A6;
 | ||
|         $CA83: NewChar := $C6A9;
 | ||
|         $CA88: NewChar := $C6AE;
 | ||
|         $CA89: NewChar := $C984;
 | ||
|         $CA8A: NewChar := $C6B1;
 | ||
|         $CA8B: NewChar := $C6B2;
 | ||
|         $CA8C: NewChar := $C985;
 | ||
|         // 0290 = CA 90
 | ||
|         $CA92: NewChar := $C6B7;
 | ||
|         {
 | ||
|         03A0 = CE A0
 | ||
| 
 | ||
|         03AC;GREEK SMALL LETTER ALPHA WITH TONOS;Ll;0;L;03B1 0301;;;;N;GREEK SMALL LETTER ALPHA TONOS;;0386;;0386
 | ||
|         03AD;GREEK SMALL LETTER EPSILON WITH TONOS;Ll;0;L;03B5 0301;;;;N;GREEK SMALL LETTER EPSILON TONOS;;0388;;0388
 | ||
|         03AE;GREEK SMALL LETTER ETA WITH TONOS;Ll;0;L;03B7 0301;;;;N;GREEK SMALL LETTER ETA TONOS;;0389;;0389
 | ||
|         03AF;GREEK SMALL LETTER IOTA WITH TONOS;Ll;0;L;03B9 0301;;;;N;GREEK SMALL LETTER IOTA TONOS;;038A;;038A
 | ||
|         }
 | ||
|         $CEAC: NewChar := $CE86;
 | ||
|         $CEAD: NewChar := $CE88;
 | ||
|         $CEAE: NewChar := $CE89;
 | ||
|         $CEAF: NewChar := $CE8A;
 | ||
|         {
 | ||
|         03B0 = CE B0
 | ||
| 
 | ||
|         03B0;GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS;Ll;0;L;03CB 0301;;;;N;GREEK SMALL LETTER UPSILON DIAERESIS TONOS;;;;
 | ||
|         03B1;GREEK SMALL LETTER ALPHA;Ll;0;L;;;;;N;;;0391;;0391
 | ||
|         ...
 | ||
|         03BF;GREEK SMALL LETTER OMICRON;Ll;0;L;;;;;N;;;039F;;039F
 | ||
|         }
 | ||
|         $CEB1..$CEBF: NewChar := OldChar - $20; // Greek Characters
 | ||
|         {
 | ||
|         03C0 = CF 80
 | ||
| 
 | ||
|         03C0;GREEK SMALL LETTER PI;Ll;0;L;;;;;N;;;03A0;;03A0 CF 80 => CE A0
 | ||
|         03C1;GREEK SMALL LETTER RHO;Ll;0;L;;;;;N;;;03A1;;03A1
 | ||
|         03C2;GREEK SMALL LETTER FINAL SIGMA;Ll;0;L;;;;;N;;;03A3;;03A3
 | ||
|         03C3;GREEK SMALL LETTER SIGMA;Ll;0;L;;;;;N;;;03A3;;03A3
 | ||
|         03C4;GREEK SMALL LETTER TAU;Ll;0;L;;;;;N;;;03A4;;03A4
 | ||
|         ....
 | ||
|         03CB;GREEK SMALL LETTER UPSILON WITH DIALYTIKA;Ll;0;L;03C5 0308;;;;N;GREEK SMALL LETTER UPSILON DIAERESIS;;03AB;;03AB
 | ||
|         03CC;GREEK SMALL LETTER OMICRON WITH TONOS;Ll;0;L;03BF 0301;;;;N;GREEK SMALL LETTER OMICRON TONOS;;038C;;038C
 | ||
|         03CD;GREEK SMALL LETTER UPSILON WITH TONOS;Ll;0;L;03C5 0301;;;;N;GREEK SMALL LETTER UPSILON TONOS;;038E;;038E
 | ||
|         03CE;GREEK SMALL LETTER OMEGA WITH TONOS;Ll;0;L;03C9 0301;;;;N;GREEK SMALL LETTER OMEGA TONOS;;038F;;038F
 | ||
|         03CF;GREEK CAPITAL KAI SYMBOL;Lu;0;L;;;;;N;;;;03D7;
 | ||
|         }
 | ||
|         $CF80,$CF81,$CF83..$CF8B: NewChar := OldChar - $E0; // Greek Characters
 | ||
|         $CF82: NewChar := $CEA3;
 | ||
|         $CF8C: NewChar := $CE8C;
 | ||
|         $CF8D: NewChar := $CE8E;
 | ||
|         $CF8E: NewChar := $CE8F;
 | ||
|         {
 | ||
|         03D0 = CF 90
 | ||
| 
 | ||
|         03D0;GREEK BETA SYMBOL;Ll;0;L;<compat> 03B2;;;;N;GREEK SMALL LETTER CURLED BETA;;0392;;0392 CF 90 => CE 92
 | ||
|         03D1;GREEK THETA SYMBOL;Ll;0;L;<compat> 03B8;;;;N;GREEK SMALL LETTER SCRIPT THETA;;0398;;0398 => CE 98
 | ||
|         03D5;GREEK PHI SYMBOL;Ll;0;L;<compat> 03C6;;;;N;GREEK SMALL LETTER SCRIPT PHI;;03A6;;03A6 => CE A6
 | ||
|         03D6;GREEK PI SYMBOL;Ll;0;L;<compat> 03C0;;;;N;GREEK SMALL LETTER OMEGA PI;;03A0;;03A0 => CE A0
 | ||
|         03D7;GREEK KAI SYMBOL;Ll;0;L;;;;;N;;;03CF;;03CF => CF 8F
 | ||
|         03D9;GREEK SMALL LETTER ARCHAIC KOPPA;Ll;0;L;;;;;N;;;03D8;;03D8
 | ||
|         03DB;GREEK SMALL LETTER STIGMA;Ll;0;L;;;;;N;;;03DA;;03DA
 | ||
|         03DD;GREEK SMALL LETTER DIGAMMA;Ll;0;L;;;;;N;;;03DC;;03DC
 | ||
|         03DF;GREEK SMALL LETTER KOPPA;Ll;0;L;;;;;N;;;03DE;;03DE
 | ||
|         }
 | ||
|         $CF90: NewChar := $CE92;
 | ||
|         $CF91: NewChar := $CE98;
 | ||
|         $CF95: NewChar := $CEA6;
 | ||
|         $CF96: NewChar := $CEA0;
 | ||
|         $CF97: NewChar := $CF8F;
 | ||
|         $CF99..$CF9F: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
 | ||
|         // 03E0 = CF A0
 | ||
|         $CFA0..$CFAF: if OldChar mod 2 = 1 then NewChar := OldChar - 1;
 | ||
|         {
 | ||
|         03F0 = CF B0
 | ||
| 
 | ||
|         03F0;GREEK KAPPA SYMBOL;Ll;0;L;<compat> 03BA;;;;N;GREEK SMALL LETTER SCRIPT KAPPA;;039A;;039A => CE 9A
 | ||
|         03F1;GREEK RHO SYMBOL;Ll;0;L;<compat> 03C1;;;;N;GREEK SMALL LETTER TAILED RHO;;03A1;;03A1 => CE A1
 | ||
|         03F2;GREEK LUNATE SIGMA SYMBOL;Ll;0;L;<compat> 03C2;;;;N;GREEK SMALL LETTER LUNATE SIGMA;;03F9;;03F9
 | ||
|         03F5;GREEK LUNATE EPSILON SYMBOL;Ll;0;L;<compat> 03B5;;;;N;;;0395;;0395 => CE 95
 | ||
|         03F8;GREEK SMALL LETTER SHO;Ll;0;L;;;;;N;;;03F7;;03F7
 | ||
|         03FB;GREEK SMALL LETTER SAN;Ll;0;L;;;;;N;;;03FA;;03FA
 | ||
|         }
 | ||
|         $CFB0: NewChar := $CE9A;
 | ||
|         $CFB1: NewChar := $CEA1;
 | ||
|         $CFB2: NewChar := $CFB9;
 | ||
|         $CFB5: NewChar := $CE95;
 | ||
|         $CFB8: NewChar := $CFB7;
 | ||
|         $CFBB: NewChar := $CFBA;
 | ||
|         // 0400 = D0 80 ... 042F everything already uppercase
 | ||
|         // 0430 = D0 B0
 | ||
|         $D0B0..$D0BF: NewChar := OldChar - $20; // Cyrillic alphabet
 | ||
|         // 0440 = D1 80
 | ||
|         $D180..$D18F: NewChar := OldChar - $E0; // Cyrillic alphabet
 | ||
|         // 0450 = D1 90
 | ||
|         $D190..$D19F: NewChar := OldChar - $110; // Cyrillic alphabet
 | ||
|         end;
 | ||
| 
 | ||
|         if NewChar <> 0 then
 | ||
|         begin
 | ||
|           OutStr[OutCounter]  := Chr(Hi(NewChar));
 | ||
|           OutStr[OutCounter+1]:= Chr(Lo(NewChar));
 | ||
|           CharProcessed := True;
 | ||
|         end;
 | ||
|       end;
 | ||
| 
 | ||
|       // Copy the character if the string was disaligned by previous changed
 | ||
|       // and no processing was done in this character
 | ||
|       if (InCounter <> OutCounter+1) and (not CharProcessed) then
 | ||
|       begin
 | ||
|         for i := 0 to CharLen-1 do
 | ||
|           OutStr[OutCounter+i]  :=AInStr[InCounter+i];
 | ||
|       end;
 | ||
| 
 | ||
|       inc(InCounter, CharLen);
 | ||
|       inc(OutCounter, NewCharLen);
 | ||
|     end;
 | ||
|   end; // while
 | ||
| 
 | ||
|   // Final correction of the buffer size
 | ||
|   SetLength(Result,OutCounter);
 | ||
| end;
 | ||
| 
 | ||
| function FindInvalidUTF8Character(p: PChar; Count: PtrInt;
 | ||
|   StopOnNonASCII: Boolean): PtrInt;
 | ||
| // return -1 if ok
 | ||
| var
 | ||
|   CharLen: Integer;
 | ||
|   c: Char;
 | ||
| begin
 | ||
|   if (p<>nil) then begin
 | ||
|     Result:=0;
 | ||
|     while Result<Count do begin
 | ||
|       c:=p^;
 | ||
|       if ord(c)<128 then begin
 | ||
|         // regular single byte ASCII character (#0 is a character, this is pascal ;)
 | ||
|         CharLen:=1;
 | ||
|       end
 | ||
|       else if ord(c)<%11000000 then begin
 | ||
|         // regular single byte character
 | ||
|         if StopOnNonASCII then
 | ||
|           exit;
 | ||
|         CharLen:=1;
 | ||
|       end
 | ||
|       else if ((ord(c) and %11100000) = %11000000) then begin
 | ||
|         // could be 2 byte character
 | ||
|         if (Result<Count-1) and ((ord(p[1]) and %11000000) = %10000000) then
 | ||
|           CharLen:=2
 | ||
|         else
 | ||
|           exit; // missing following bytes
 | ||
|       end
 | ||
|       else if ((ord(c) and %11110000) = %11100000) then begin
 | ||
|         // could be 3 byte character
 | ||
|         if (Result<Count-2) and ((ord(p[1]) and %11000000) = %10000000)
 | ||
|         and ((ord(p[2]) and %11000000) = %10000000) then
 | ||
|           CharLen:=3
 | ||
|         else
 | ||
|           exit; // missing following bytes
 | ||
|       end
 | ||
|       else if ((ord(c) and %11111000) = %11110000) then begin
 | ||
|         // could be 4 byte character
 | ||
|         if (Result<Count-3) and ((ord(p[1]) and %11000000) = %10000000)
 | ||
|         and ((ord(p[2]) and %11000000) = %10000000)
 | ||
|         and ((ord(p[3]) and %11000000) = %10000000) then
 | ||
|           CharLen:=4
 | ||
|         else
 | ||
|           exit; // missing following bytes
 | ||
|       end
 | ||
|       else begin
 | ||
|         if StopOnNonASCII then
 | ||
|           exit;
 | ||
|         CharLen:=1;
 | ||
|       end;
 | ||
|       inc(Result,CharLen);
 | ||
|       inc(p,CharLen);
 | ||
|       if Result>Count then begin
 | ||
|         dec(Result,CharLen);
 | ||
|         exit; // missing following bytes
 | ||
|       end;
 | ||
|     end;
 | ||
|   end;
 | ||
|   // ok
 | ||
|   Result:=-1;
 | ||
| end;
 | ||
| 
 | ||
| function ValidUTF8String(const s: String): String;
 | ||
| var
 | ||
|   p, cur: PChar;
 | ||
|   l, lr: integer;
 | ||
|   NeedFree: Boolean;
 | ||
| begin
 | ||
|   if FindInvalidUTF8Character(PChar(s), Length(s)) <> -1 then
 | ||
|   begin
 | ||
|     NeedFree := True;
 | ||
|     GetMem(p, Length(s) + 1);
 | ||
|     StrPCopy(p, s);
 | ||
|     UTF8FixBroken(p);
 | ||
|   end
 | ||
|   else
 | ||
|   begin
 | ||
|     p := PChar(s);
 | ||
|     NeedFree := False;
 | ||
|   end;
 | ||
| 
 | ||
|   Result := '';
 | ||
|   cur := p;
 | ||
|   while cur^ <> #0 do
 | ||
|   begin
 | ||
|     l := UTF8CharacterLength(cur);
 | ||
|     if (l = 1) and (cur^ < #32) then
 | ||
|       Result := Result + '#' + IntToStr(Ord(cur^))
 | ||
|     else
 | ||
|     begin
 | ||
|       lr := Length(Result);
 | ||
|       SetLength(Result, lr + l);
 | ||
|       System.Move(cur^, Result[lr + 1], l);
 | ||
|     end;
 | ||
|     inc(cur, l)
 | ||
|   end;
 | ||
| 
 | ||
|   if NeedFree then
 | ||
|     FreeMem(p);
 | ||
| end;
 | ||
| 
 | ||
| function UTF8Trim(const s: string; Flags: TUTF8TrimFlags): string;
 | ||
| var
 | ||
|   p: PChar;
 | ||
|   u: Cardinal;
 | ||
|   StartP: PtrUInt;
 | ||
|   l: Integer;
 | ||
|   KeepAllNonASCII: boolean;
 | ||
| begin
 | ||
|   Result:=s;
 | ||
|   if Result='' then exit;
 | ||
|   KeepAllNonASCII:=[u8tKeepControlCodes,u8tKeepNoBreakSpaces]*Flags=[u8tKeepControlCodes,u8tKeepNoBreakSpaces];
 | ||
|   if not (u8tKeepStart in Flags) then begin
 | ||
|     // trim start
 | ||
|     p:=PChar(Result);
 | ||
|     repeat
 | ||
|       l:=1;
 | ||
|       case p^ of
 | ||
|       #0:
 | ||
|         if p-PChar(Result)=length(Result) then
 | ||
|         begin
 | ||
|           // everything was trimmed
 | ||
|           exit('')
 | ||
|         end else if u8tKeepControlCodes in Flags then
 | ||
|           break;
 | ||
|       ' ': ;
 | ||
|       #10,#13:
 | ||
|         if u8tKeepLineBreaks in Flags then
 | ||
|           break;
 | ||
|       #9:
 | ||
|         if u8tKeepTabs in Flags then
 | ||
|           break;
 | ||
|       #1..#8,#11,#12,#14..#31,#127:
 | ||
|         if u8tKeepControlCodes in Flags then
 | ||
|           break;
 | ||
|       #128..#255:
 | ||
|         begin
 | ||
|           if KeepAllNonASCII then break;
 | ||
|           u:=UTF8CharacterToUnicode(p,l);
 | ||
|           if (l<=1) then break; // invalid character
 | ||
|           case u of
 | ||
|           128..159, // C1 set of control codes
 | ||
|           8206, 8207: // left-to-right, right-to-left mark
 | ||
|             if u8tKeepControlCodes in Flags then break;
 | ||
|           160,   // no break space
 | ||
|           $2007, // figure space
 | ||
|           $2026, // narrow no-break space
 | ||
|           $FEFF: // zero with no-break space
 | ||
|             if u8tKeepNoBreakSpaces in Flags then break;
 | ||
|           else
 | ||
|             break;
 | ||
|           end;
 | ||
|         end;
 | ||
|       else
 | ||
|         break;
 | ||
|       end;
 | ||
|       inc(p,l);
 | ||
|     until false;
 | ||
|     if p>PChar(Result) then begin
 | ||
|       Result:=copy(Result,p-PChar(Result)+1,length(Result));
 | ||
|       if Result='' then exit;
 | ||
|     end;
 | ||
|   end;
 | ||
| 
 | ||
|   if not (u8tKeepEnd in Flags) then begin
 | ||
|     // trim end
 | ||
|     p:=@Result[length(Result)];
 | ||
|     repeat
 | ||
|       case p^ of
 | ||
|       #0:
 | ||
|         if u8tKeepControlCodes in Flags then
 | ||
|           break;
 | ||
|       ' ': ;
 | ||
|       #10,#13:
 | ||
|         if u8tKeepLineBreaks in Flags then
 | ||
|           break;
 | ||
|       #9:
 | ||
|         if u8tKeepTabs in Flags then
 | ||
|           break;
 | ||
|       #1..#8,#11,#12,#14..#31,#127:
 | ||
|         if u8tKeepControlCodes in Flags then
 | ||
|           break;
 | ||
|       #128..#255:
 | ||
|         begin
 | ||
|           if KeepAllNonASCII then break;
 | ||
|           StartP:=UTF8FindNearestCharStart(PChar(Result),length(Result),p-PChar(Result));
 | ||
|           u:=UTF8CharacterToUnicode(PChar(Result)+StartP,l);
 | ||
|           if (l<=1) then break; // invalid character
 | ||
|           case u of
 | ||
|           128..159, // C1 set of control codes
 | ||
|           8206, 8207: // left-to-right, right-to-left mark
 | ||
|             if u8tKeepControlCodes in Flags then break;
 | ||
|           160,   // no break space
 | ||
|           $2007, // figure space
 | ||
|           $2026, // narrow no-break space
 | ||
|           $FEFF: // zero with no-break space
 | ||
|             if u8tKeepNoBreakSpaces in Flags then break;
 | ||
|           else
 | ||
|             break;
 | ||
|           end;
 | ||
|           p:=PChar(Result)+StartP;
 | ||
|         end;
 | ||
|       else
 | ||
|         break;
 | ||
|       end;
 | ||
|       dec(p);
 | ||
|     until p<PChar(Result);
 | ||
|     // p is on last good byte
 | ||
|     SetLength(Result,p+1-PChar(Result));
 | ||
|   end;
 | ||
| end;
 | ||
| 
 | ||
| procedure AssignUTF8ListToAnsi(UTF8List, AnsiList: TStrings);
 | ||
| var
 | ||
|   i: Integer;
 | ||
| begin
 | ||
|   AnsiList.Clear;
 | ||
|   if UTF8List=nil then exit;
 | ||
|   for i:=0 to UTF8List.Count-1 do
 | ||
|     AnsiList.Add(UTF8ToSys(UTF8List[i]));
 | ||
| end;
 | ||
| 
 | ||
| {------------------------------------------------------------------------------
 | ||
|   Name:    UTF8CompareStr
 | ||
|   Params: S1, S2 - UTF8 encoded strings
 | ||
|   Returns: < 0 if S1 < S2, 0 if S1 = S2, > 0 if S2 > S1.
 | ||
|   Compare 2 UTF8 encoded strings, case sensitive.
 | ||
|  ------------------------------------------------------------------------------}
 | ||
| function UTF8CompareStr(const S1, S2: string): Integer;
 | ||
| begin
 | ||
|   Result := UTF8CompareStr(PChar(Pointer(S1)),length(S1),
 | ||
|                             PChar(Pointer(S2)),length(S2));
 | ||
| end;
 | ||
| 
 | ||
| function UTF8CompareStrP(S1, S2: PChar): Integer;
 | ||
| begin
 | ||
|   Result:=UTF8CompareStr(S1,StrLen(S1),S2,StrLen(S2));
 | ||
| end;
 | ||
| 
 | ||
| function UTF8CompareStr(S1: PChar; Count1: SizeInt; S2: PChar; Count2: SizeInt
 | ||
|   ): Integer;
 | ||
| var
 | ||
|   Count: SizeInt;
 | ||
| begin
 | ||
|   Result := 0;
 | ||
|   if Count1>Count2 then
 | ||
|     Count:=Count2
 | ||
|   else
 | ||
|     Count:=Count1;
 | ||
|   Result := CompareMemRange(Pointer(S1),Pointer(S2), Count); // Note: CompareMemRange can handle nil if Count=0
 | ||
|   if Result<>0 then exit;
 | ||
|   if Count1>Count2 then
 | ||
|     Result:=1
 | ||
|   else if Count1<Count2 then
 | ||
|     Result:=-1
 | ||
|   else
 | ||
|     Result:=0;
 | ||
| end;
 | ||
| 
 | ||
| {------------------------------------------------------------------------------
 | ||
|   Name:    UTF8CompareText
 | ||
|   Params: S1, S2 - UTF8 encoded strings
 | ||
|   Returns: < 0 if S1 < S2, 0 if S1 = S2, > 0 if S2 > S1.
 | ||
|   Compare 2 UTF8 encoded strings, case insensitive.
 | ||
|   Note: Use this function instead of AnsiCompareText.
 | ||
|   This function guarantees proper collation on all supported platforms.
 | ||
|  ------------------------------------------------------------------------------}
 | ||
| function UTF8CompareText(const S1, S2: string): Integer;
 | ||
| var
 | ||
|   S1Lower, S2Lower: string;
 | ||
| begin
 | ||
|   S1Lower := UTF8LowerCase(S1);
 | ||
|   S2Lower := UTF8LowerCase(S2);
 | ||
|   Result := UTF8CompareStr(S1Lower, S2Lower);
 | ||
| end;
 | ||
| 
 | ||
| function UTF8CompareStrCollated(const S1, S2: string): Integer;
 | ||
| begin
 | ||
|   {$IFDEF MSWINDOWS}
 | ||
|     Result := AnsiCompareStr(UTF8ToSys(S1), UTF8ToSys(S2));
 | ||
|   {$ELSE}
 | ||
|     Result := WideCompareStr(WideString(S1),WideString(S2));
 | ||
|   {$ENDIF}
 | ||
| end;
 | ||
| 
 | ||
| function CompareStrListUTF8LowerCase(List: TStringList; Index1, Index2: Integer
 | ||
|   ): Integer;
 | ||
| begin
 | ||
|   Result:=UTF8CompareText(List[Index1],List[Index2]);
 | ||
| end;
 | ||
| 
 | ||
| {------------------------------------------------------------------------------
 | ||
|   Name:    ConvertUTF8ToUTF16
 | ||
|   Params:  Dest                - Pointer to destination string
 | ||
|            DestWideCharCount   - Wide char count allocated in destination string
 | ||
|            Src                 - Pointer to source string
 | ||
|            SrcCharCount        - Char count allocated in source string
 | ||
|            Options             - Conversion options, if none is set, both
 | ||
|              invalid and unfinished source chars are skipped
 | ||
| 
 | ||
|              toInvalidCharError       - Stop on invalid source char and report
 | ||
|                                       error
 | ||
|              toInvalidCharToSymbol    - Replace invalid source chars with '?'
 | ||
|              toUnfinishedCharError    - Stop on unfinished source char and
 | ||
|                                       report error
 | ||
|              toUnfinishedCharToSymbol - Replace unfinished source char with '?'
 | ||
| 
 | ||
|            ActualWideCharCount - Actual wide char count converted from source
 | ||
|                                string to destination string
 | ||
|   Returns:
 | ||
|     trNoError        - The string was successfully converted without
 | ||
|                      any error
 | ||
|     trNullSrc        - Pointer to source string is nil
 | ||
|     trNullDest       - Pointer to destination string is nil
 | ||
|     trDestExhausted  - Destination buffer size is not big enough to hold
 | ||
|                      converted string
 | ||
|     trInvalidChar    - Invalid source char has occured
 | ||
|     trUnfinishedChar - Unfinished source char has occured
 | ||
| 
 | ||
|   Converts the specified UTF-8 encoded string to UTF-16 encoded (system endian)
 | ||
|  ------------------------------------------------------------------------------}
 | ||
| function ConvertUTF8ToUTF16(Dest: PWideChar; DestWideCharCount: SizeUInt;
 | ||
|   Src: PChar; SrcCharCount: SizeUInt; Options: TConvertOptions;
 | ||
|   out ActualWideCharCount: SizeUInt): TConvertResult;
 | ||
| var
 | ||
|   DestI, SrcI: SizeUInt;
 | ||
|   B1, B2, B3, B4: Byte;
 | ||
|   W: Word;
 | ||
|   C: Cardinal;
 | ||
| 
 | ||
|   function UnfinishedCharError: Boolean;
 | ||
|   begin
 | ||
|     if toUnfinishedCharToSymbol in Options then
 | ||
|     begin
 | ||
|       Dest[DestI] := System.WideChar('?');
 | ||
|       Inc(DestI);
 | ||
|       Result := False;
 | ||
|     end
 | ||
|     else
 | ||
|       if toUnfinishedCharError in Options then
 | ||
|       begin
 | ||
|         ConvertUTF8ToUTF16 := trUnfinishedChar;
 | ||
|         Result := True;
 | ||
|       end
 | ||
|       else Result := False;
 | ||
|   end;
 | ||
| 
 | ||
|   function InvalidCharError(Count: SizeUInt): Boolean; inline;
 | ||
|   begin
 | ||
|     if not (toInvalidCharError in Options) then
 | ||
|     begin
 | ||
|       if toInvalidCharToSymbol in Options then
 | ||
|       begin
 | ||
|         Dest[DestI] := System.WideChar('?');
 | ||
|         Inc(DestI);
 | ||
|       end;
 | ||
| 
 | ||
|       Dec(SrcI, Count);
 | ||
| 
 | ||
|       // skip trailing UTF-8 char bytes
 | ||
|       while (Count > 0) do
 | ||
|       begin
 | ||
|         if (Byte(Src[SrcI]) and %11000000) <> %10000000 then Break;
 | ||
|         Inc(SrcI);
 | ||
|         Dec(Count);
 | ||
|       end;
 | ||
| 
 | ||
|       Result := False;
 | ||
|     end
 | ||
|     else
 | ||
|       if toInvalidCharError in Options then
 | ||
|       begin
 | ||
|         ConvertUTF8ToUTF16 := trUnfinishedChar;
 | ||
|         Result := True;
 | ||
|       end;
 | ||
|   end;
 | ||
| 
 | ||
| begin
 | ||
|   ActualWideCharCount := 0;
 | ||
| 
 | ||
|   if not Assigned(Src) then
 | ||
|   begin
 | ||
|     Result := trNullSrc;
 | ||
|     Exit;
 | ||
|   end;
 | ||
| 
 | ||
|   if not Assigned(Dest) then
 | ||
|   begin
 | ||
|     Result := trNullDest;
 | ||
|     Exit;
 | ||
|   end;
 | ||
|   SrcI := 0;
 | ||
|   DestI := 0;
 | ||
| 
 | ||
|   while (DestI < DestWideCharCount) and (SrcI < SrcCharCount) do
 | ||
|   begin
 | ||
|     B1 := Byte(Src[SrcI]);
 | ||
|     Inc(SrcI);
 | ||
| 
 | ||
|     if B1 < 128 then // single byte UTF-8 char
 | ||
|     begin
 | ||
|       Dest[DestI] := System.WideChar(B1);
 | ||
|       Inc(DestI);
 | ||
|     end
 | ||
|     else
 | ||
|     begin
 | ||
|       if SrcI >= SrcCharCount then
 | ||
|         if UnfinishedCharError then Exit(trInvalidChar)
 | ||
|         else Break;
 | ||
| 
 | ||
|       B2 := Byte(Src[SrcI]);
 | ||
|       Inc(SrcI);
 | ||
| 
 | ||
|       if (B1 and %11100000) = %11000000 then // double byte UTF-8 char
 | ||
|       begin
 | ||
|         if (B2 and %11000000) = %10000000 then
 | ||
|         begin
 | ||
|           Dest[DestI] := System.WideChar(((B1 and %00011111) shl 6) or (B2 and %00111111));
 | ||
|           Inc(DestI);
 | ||
|         end
 | ||
|         else // invalid character, assume single byte UTF-8 char
 | ||
|           if InvalidCharError(1) then Exit(trInvalidChar);
 | ||
|       end
 | ||
|       else
 | ||
|       begin
 | ||
|         if SrcI >= SrcCharCount then
 | ||
|           if UnfinishedCharError then Exit(trInvalidChar)
 | ||
|           else Break;
 | ||
| 
 | ||
|         B3 := Byte(Src[SrcI]);
 | ||
|         Inc(SrcI);
 | ||
| 
 | ||
|         if (B1 and %11110000) = %11100000 then // triple byte UTF-8 char
 | ||
|         begin
 | ||
|           if ((B2 and %11000000) = %10000000) and ((B3 and %11000000) = %10000000) then
 | ||
|           begin
 | ||
|             W := ((B1 and %00011111) shl 12) or ((B2 and %00111111) shl 6) or (B3 and %00111111);
 | ||
|             if (W < $D800) or (W > $DFFF) then // to single wide char UTF-16 char
 | ||
|             begin
 | ||
|               Dest[DestI] := System.WideChar(W);
 | ||
|               Inc(DestI);
 | ||
|             end
 | ||
|             else // invalid UTF-16 character, assume double byte UTF-8 char
 | ||
|               if InvalidCharError(2) then Exit(trInvalidChar);
 | ||
|           end
 | ||
|           else // invalid character, assume double byte UTF-8 char
 | ||
|             if InvalidCharError(2) then Exit(trInvalidChar);
 | ||
|         end
 | ||
|         else
 | ||
|         begin
 | ||
|           if SrcI >= SrcCharCount then
 | ||
|             if UnfinishedCharError then Exit(trInvalidChar)
 | ||
|             else Break;
 | ||
| 
 | ||
|           B4 := Byte(Src[SrcI]);
 | ||
|           Inc(SrcI);
 | ||
| 
 | ||
|           if ((B1 and %11111000) = %11110000) and ((B2 and %11000000) = %10000000)
 | ||
|             and ((B3 and %11000000) = %10000000) and ((B4 and %11000000) = %10000000) then
 | ||
|           begin // 4 byte UTF-8 char
 | ||
|             C := ((B1 and %00011111) shl 18) or ((B2 and %00111111) shl 12)
 | ||
|               or ((B3 and %00111111) shl 6)  or (B4 and %00111111);
 | ||
|             // to double wide char UTF-16 char
 | ||
|             Dest[DestI] := System.WideChar($D800 or ((C - $10000) shr 10));
 | ||
|             Inc(DestI);
 | ||
|             if DestI >= DestWideCharCount then Break;
 | ||
|             Dest[DestI] := System.WideChar($DC00 or ((C - $10000) and %0000001111111111));
 | ||
|             Inc(DestI);
 | ||
|           end
 | ||
|           else // invalid character, assume triple byte UTF-8 char
 | ||
|             if InvalidCharError(3) then Exit(trInvalidChar);
 | ||
|         end;
 | ||
|       end;
 | ||
|     end;
 | ||
|   end;
 | ||
| 
 | ||
|   if DestI >= DestWideCharCount then
 | ||
|   begin
 | ||
|     DestI := DestWideCharCount - 1;
 | ||
|     Result := trDestExhausted;
 | ||
|   end
 | ||
|   else
 | ||
|     Result := trNoError;
 | ||
| 
 | ||
|   Dest[DestI] := #0;
 | ||
|   ActualWideCharCount := DestI + 1;
 | ||
| end;
 | ||
| 
 | ||
| {------------------------------------------------------------------------------
 | ||
|   Name:    ConvertUTF16ToUTF8
 | ||
|   Params:  Dest             - Pointer to destination string
 | ||
|            DestCharCount    - Char count allocated in destination string
 | ||
|            Src              - Pointer to source string
 | ||
|            SrcWideCharCount - Wide char count allocated in source string
 | ||
|            Options          - Conversion options, if none is set, both
 | ||
|              invalid and unfinished source chars are skipped.
 | ||
|              See ConvertUTF8ToUTF16 for details.
 | ||
| 
 | ||
|            ActualCharCount  - Actual char count converted from source
 | ||
|                             string to destination string
 | ||
|   Returns: See ConvertUTF8ToUTF16
 | ||
| 
 | ||
|   Converts the specified UTF-16 encoded string (system endian) to UTF-8 encoded
 | ||
|  ------------------------------------------------------------------------------}
 | ||
| function ConvertUTF16ToUTF8(Dest: PChar; DestCharCount: SizeUInt;
 | ||
|   Src: PWideChar; SrcWideCharCount: SizeUInt; Options: TConvertOptions;
 | ||
|   out ActualCharCount: SizeUInt): TConvertResult;
 | ||
| var
 | ||
|   DestI, SrcI: SizeUInt;
 | ||
|   W1, W2: Word;
 | ||
|   C: Cardinal;
 | ||
| 
 | ||
|   function UnfinishedCharError: Boolean;
 | ||
|   begin
 | ||
|     if toUnfinishedCharToSymbol in Options then
 | ||
|     begin
 | ||
|       Dest[DestI] := Char('?');
 | ||
|       Inc(DestI);
 | ||
|       Result := False;
 | ||
|     end
 | ||
|     else
 | ||
|       if toUnfinishedCharError in Options then
 | ||
|       begin
 | ||
|         ConvertUTF16ToUTF8 := trUnfinishedChar;
 | ||
|         Result := True;
 | ||
|       end
 | ||
|       else Result := False;
 | ||
|   end;
 | ||
| 
 | ||
|   function InvalidCharError(Count: SizeUInt): Boolean; inline;
 | ||
|   begin
 | ||
|     if not (toInvalidCharError in Options) then
 | ||
|     begin
 | ||
|       if toInvalidCharToSymbol in Options then
 | ||
|       begin
 | ||
|         Dest[DestI] := Char('?');
 | ||
|         Inc(DestI);
 | ||
|       end;
 | ||
| 
 | ||
|       Dec(SrcI, Count);
 | ||
|       // skip trailing UTF-16 wide char
 | ||
|       if (Word(Src[SrcI]) and $FC00) = $DC00 then Inc(SrcI);
 | ||
| 
 | ||
|       Result := False;
 | ||
|     end
 | ||
|     else
 | ||
|       if toInvalidCharError in Options then
 | ||
|       begin
 | ||
|         ConvertUTF16ToUTF8 := trUnfinishedChar;
 | ||
|         Result := True;
 | ||
|       end;
 | ||
|   end;
 | ||
| 
 | ||
| begin
 | ||
|   ActualCharCount := 0;
 | ||
| 
 | ||
|   if not Assigned(Src) then
 | ||
|   begin
 | ||
|     Result := trNullSrc;
 | ||
|     Exit;
 | ||
|   end;
 | ||
| 
 | ||
|   if not Assigned(Dest) then
 | ||
|   begin
 | ||
|     Result := trNullDest;
 | ||
|     Exit;
 | ||
|   end;
 | ||
|   SrcI := 0;
 | ||
|   DestI := 0;
 | ||
| 
 | ||
|   while (DestI < DestCharCount) and (SrcI < SrcWideCharCount) do
 | ||
|   begin
 | ||
|     W1 := Word(Src[SrcI]);
 | ||
|     Inc(SrcI);
 | ||
| 
 | ||
|     if (W1 < $D800) or (W1 > $DFFF) then // single wide char UTF-16 char
 | ||
|     begin
 | ||
|       if W1 < $0080 then // to single byte UTF-8 char
 | ||
|       begin
 | ||
|         Dest[DestI] := Char(W1);
 | ||
|         Inc(DestI);
 | ||
|       end
 | ||
|       else
 | ||
|         if W1 < $0800 then // to double byte UTF-8 char
 | ||
|         begin
 | ||
|           Dest[DestI] := Char(%11000000 or ((W1 and %11111000000) shr 6));
 | ||
|           Inc(DestI);
 | ||
|           if DestI >= DestCharCount then Break;
 | ||
|           Dest[DestI] := Char(%10000000 or (W1 and %111111));
 | ||
|           Inc(DestI);
 | ||
|         end
 | ||
|         else
 | ||
|         begin // to triple byte UTF-8 char
 | ||
|           Dest[DestI] := Char(%11100000 or ((W1 and %1111000000000000) shr 12));
 | ||
|           Inc(DestI);
 | ||
|           if DestI >= DestCharCount then Break;
 | ||
|           Dest[DestI] := Char(%10000000 or ((W1 and %111111000000) shr 6));
 | ||
|           Inc(DestI);
 | ||
|           if DestI >= DestCharCount then Break;
 | ||
|           Dest[DestI] := Char(%10000000 or (W1 and %111111));
 | ||
|           Inc(DestI);
 | ||
|         end;
 | ||
|     end
 | ||
|     else
 | ||
|     begin
 | ||
|       if SrcI >= SrcWideCharCount then
 | ||
|         if UnfinishedCharError then Exit(trInvalidChar)
 | ||
|         else Break;
 | ||
| 
 | ||
|       W2 := Word(Src[SrcI]);
 | ||
|       Inc(SrcI);
 | ||
| 
 | ||
|       if (W1 and $F800) = $D800 then // double wide char UTF-16 char
 | ||
|       begin
 | ||
|         if (W2 and $FC00) = $DC00 then
 | ||
|         begin
 | ||
|           C := (W1 - $D800) shl 10 + (W2 - $DC00) + $10000;
 | ||
| 
 | ||
|           // to 4 byte UTF-8 char
 | ||
|           Dest[DestI] := Char(%11110000 or (C shr 18));
 | ||
|           Inc(DestI);
 | ||
|           if DestI >= DestCharCount then Break;
 | ||
|           Dest[DestI] := Char(%10000000 or ((C and $3F000) shr 12));
 | ||
|           Inc(DestI);
 | ||
|           if DestI >= DestCharCount then Break;
 | ||
|           Dest[DestI] := Char(%10000000 or ((C and %111111000000) shr 6));
 | ||
|           Inc(DestI);
 | ||
|           if DestI >= DestCharCount then Break;
 | ||
|           Dest[DestI] := Char(%10000000 or (C and %111111));
 | ||
|           Inc(DestI);
 | ||
|         end
 | ||
|         else // invalid character, assume single wide char UTF-16 char
 | ||
|           if InvalidCharError(1) then Exit(trInvalidChar);
 | ||
|       end
 | ||
|       else // invalid character, assume single wide char UTF-16 char
 | ||
|         if InvalidCharError(1) then Exit(trInvalidChar);
 | ||
|     end;
 | ||
|   end;
 | ||
| 
 | ||
|   if DestI >= DestCharCount then
 | ||
|   begin
 | ||
|     DestI := DestCharCount - 1;
 | ||
|     Result := trDestExhausted;
 | ||
|   end
 | ||
|   else
 | ||
|     Result := trNoError;
 | ||
| 
 | ||
|   Dest[DestI] := #0;
 | ||
|   ActualCharCount := DestI + 1;
 | ||
| end;
 | ||
| 
 | ||
| {------------------------------------------------------------------------------
 | ||
|   Name:    UTF8ToUTF16
 | ||
|   Params:  S - Source UTF-8 string
 | ||
|   Returns: UTF-16 encoded string
 | ||
| 
 | ||
|   Converts the specified UTF-8 encoded string to UTF-16 encoded (system endian)
 | ||
|   Avoid copying the result string since on windows a widestring requires a full
 | ||
|   copy
 | ||
|  ------------------------------------------------------------------------------}
 | ||
| function UTF8ToUTF16(const S: AnsiString): UnicodeString;
 | ||
| var
 | ||
|   L: SizeUInt;
 | ||
| begin
 | ||
|   if S = ''
 | ||
|   then begin
 | ||
|     Result := '';
 | ||
|     Exit;
 | ||
|   end;
 | ||
| 
 | ||
|   SetLength(Result, Length(S));
 | ||
|   // wide chars of UTF-16 <= bytes of UTF-8 string
 | ||
|   if ConvertUTF8ToUTF16(PWideChar(Result), Length(Result) + 1, PChar(S), Length(S),
 | ||
|     [toInvalidCharToSymbol], L) = trNoError
 | ||
|   then SetLength(Result, L - 1)
 | ||
|   else Result := '';
 | ||
| end;
 | ||
| 
 | ||
| {------------------------------------------------------------------------------
 | ||
|   Name:    UTF16ToUTF8
 | ||
|   Params:  S - Source UTF-16 string (system endian)
 | ||
|   Returns: UTF-8 encoded string
 | ||
| 
 | ||
|   Converts the specified UTF-16 encoded string (system endian) to UTF-8 encoded
 | ||
|  ------------------------------------------------------------------------------}
 | ||
| function UTF16ToUTF8(const S: UnicodeString): AnsiString;
 | ||
| var
 | ||
|   L: SizeUInt;
 | ||
|   R: AnsiString;
 | ||
| begin
 | ||
|   Result := '';
 | ||
|   if S = '' then Exit;
 | ||
| 
 | ||
|   SetLength(R, Length(S) * 3);
 | ||
|   // bytes of UTF-8 <= 3 * wide chars of UTF-16 string
 | ||
|   // e.g. %11100000 10100000 10000000 (UTF-8) is $0800 (UTF-16)
 | ||
|   if ConvertUTF16ToUTF8(PChar(R), Length(R) + 1, PWideChar(S), Length(S),
 | ||
|     [toInvalidCharToSymbol], L) = trNoError then
 | ||
|   begin
 | ||
|     SetLength(R, L - 1);
 | ||
|     Result := R;
 | ||
|   end;
 | ||
| end;
 | ||
| 
 | ||
| procedure LazGetLanguageIDs(var Lang, FallbackLang: String);
 | ||
| 
 | ||
|   {$IFDEF DARWIN}
 | ||
|   function GetLanguage: boolean;
 | ||
|   var
 | ||
|     Ref: CFStringRef;
 | ||
|     LangArray: CFMutableArrayRef;
 | ||
|     StrSize: CFIndex;
 | ||
|     StrRange: CFRange;
 | ||
|     Locals: CFArrayRef;
 | ||
|     Bundle: CFBundleRef;
 | ||
|   begin
 | ||
|     Result := false;
 | ||
|     Bundle:=CFBundleGetMainBundle;
 | ||
|     if Bundle=nil then exit;
 | ||
|     Locals:=CFBundleCopyBundleLocalizations(Bundle);
 | ||
|     if Locals=nil then exit;
 | ||
|     LangArray := CFBundleCopyLocalizationsForPreferences(Locals, nil);
 | ||
|     try
 | ||
|       if CFArrayGetCount(LangArray) > 0 then
 | ||
|       begin
 | ||
|         Ref := CFArrayGetValueAtIndex(LangArray, 0);
 | ||
|         StrRange.location := 0;
 | ||
|         StrRange.length := CFStringGetLength(Ref);
 | ||
| 
 | ||
|         StrSize:=0;
 | ||
|         CFStringGetBytes(Ref, StrRange, kCFStringEncodingUTF8,
 | ||
|           Ord('?'), False, nil, 0, StrSize);
 | ||
|         SetLength(Lang, StrSize);
 | ||
| 
 | ||
|         if StrSize > 0 then
 | ||
|         begin
 | ||
|           CFStringGetBytes(Ref, StrRange, kCFStringEncodingUTF8,
 | ||
|             Ord('?'), False, @Lang[1], StrSize, StrSize);
 | ||
|           Result:=true;
 | ||
|           FallbackLang := Copy(Lang, 1, 2);
 | ||
|         end;
 | ||
|       end;
 | ||
|     finally
 | ||
|       CFRelease(LangArray);
 | ||
|       CFRelease(Locals);
 | ||
|     end;
 | ||
|   end;
 | ||
|   {$ENDIF}
 | ||
| begin
 | ||
| {$IFDEF DARWIN}
 | ||
|   if not GetLanguage then
 | ||
|     GetLanguageIDs(Lang, FallbackLang);
 | ||
| {$ELSE}
 | ||
|   GetLanguageIDs(Lang, FallbackLang);
 | ||
| {$ENDIF}
 | ||
| end;
 | ||
| 
 | ||
| {
 | ||
| This routine will strip country information from the language ID
 | ||
| making it more simple
 | ||
| 
 | ||
| Ideally the resulting ID from here should conform to ISO 639-1
 | ||
| or ISO 639-2, if the language has no code in ISO 639-1
 | ||
| }
 | ||
| procedure LazGetShortLanguageID(var Lang: String);
 | ||
| var
 | ||
|   FallbackLang: String;
 | ||
| begin
 | ||
|   FallbackLang:='';
 | ||
|   LazGetLanguageIDs(Lang, FallbackLang);
 | ||
| 
 | ||
|   // Simply making sure its length is at most 2 should be enough for most languages
 | ||
|   if Length(Lang) > 2 then Lang := Lang[1] + Lang[2];
 | ||
| end;
 | ||
| 
 | ||
| procedure InitFPUpchars;
 | ||
| var
 | ||
|   c: Char;
 | ||
| begin
 | ||
|   for c:=Low(char) to High(char) do begin
 | ||
|     FPUpChars[c]:=upcase(c);
 | ||
|   end;
 | ||
| end;
 | ||
| 
 | ||
| initialization
 | ||
| begin
 | ||
|   InitFPUpchars;
 | ||
|   InitLazUtf8;
 | ||
| end;
 | ||
| 
 | ||
| end.
 | ||
| 
 | 
