{ ***************************************************************************** * * * This file is part of the Lazarus Component Library (LCL) * * * * See the file COPYING.modifiedLGPL.txt, included in this distribution, * * for details about the copyright. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * * * ***************************************************************************** } unit fpsconvencoding; {$mode objfpc}{$H+} //{$IFDEF WINDOWS} //{$WARNING Windows/Wine/ReactOS locale conversion is not fully supported yet. Sorry.} //{$ENDIF} interface //{$IFNDEF DisableIconv} //{$IFDEF UNIX}{$IF not defined(VER2_2_0) and not defined(VER2_2_2)}{$DEFINE HasIconvEnc}{$ENDIF}{$ENDIF} //{$ENDIF} uses SysUtils, Classes, dos {$IFDEF HasIconvEnc},iconvenc{$ENDIF}; const EncodingUTF8 = 'utf8'; EncodingAnsi = 'ansi'; EncodingUTF8BOM = 'utf8bom'; // UTF-8 with byte order mark EncodingUCS2LE = 'ucs2le'; // UCS 2 byte little endian EncodingUCS2BE = 'ucs2le'; // UCS 2 byte big endian type TConvertEncodingFunction = function(const s: string): string; TCharToUTF8Table = array[char] of PChar; TUnicodeToCharID = function(Unicode: cardinal): integer; var ConvertAnsiToUTF8: TConvertEncodingFunction = nil; ConvertUTF8ToAnsi: TConvertEncodingFunction = nil; function SingleByteToUTF8(const s: string; const Table: TCharToUTF8Table): string; function UTF8BOMToUTF8(const s: string): string; // UTF8 with BOM function ISO_8859_1ToUTF8(const s: string): string; // central europe function UTF8ToSingleByte(const s: string; const UTF8CharConvFunc: TUnicodeToCharID): string; function UTF8ToISO_8859_1(const s: string): string; // central europe function UTF8CharacterToUnicode(p: PChar; out CharLen: integer): Cardinal; implementation const ArrayISO_8859_1ToUTF8: TCharToUTF8Table = ( #0, // #0 #1, // #1 #2, // #2 #3, // #3 #4, // #4 #5, // #5 #6, // #6 #7, // #7 #8, // #8 #9, // #9 #10, // #10 #11, // #11 #12, // #12 #13, // #13 #14, // #14 #15, // #15 #16, // #16 #17, // #17 #18, // #18 #19, // #19 #20, // #20 #21, // #21 #22, // #22 #23, // #23 #24, // #24 #25, // #25 #26, // #26 #27, // #27 #28, // #28 #29, // #29 #30, // #30 #31, // #31 ' ', // ' ' '!', // '!' '"', // '"' '#', // '#' '$', // '$' '%', // '%' '&', // '&' '''', // '''' '(', // '(' ')', // ')' '*', // '*' '+', // '+' ',', // ',' '-', // '-' '.', // '.' '/', // '/' '0', // '0' '1', // '1' '2', // '2' '3', // '3' '4', // '4' '5', // '5' '6', // '6' '7', // '7' '8', // '8' '9', // '9' ':', // ':' ';', // ';' '<', // '<' '=', // '=' '>', // '>' '?', // '?' '@', // '@' 'A', // 'A' 'B', // 'B' 'C', // 'C' 'D', // 'D' 'E', // 'E' 'F', // 'F' 'G', // 'G' 'H', // 'H' 'I', // 'I' 'J', // 'J' 'K', // 'K' 'L', // 'L' 'M', // 'M' 'N', // 'N' 'O', // 'O' 'P', // 'P' 'Q', // 'Q' 'R', // 'R' 'S', // 'S' 'T', // 'T' 'U', // 'U' 'V', // 'V' 'W', // 'W' 'X', // 'X' 'Y', // 'Y' 'Z', // 'Z' '[', // '[' '\', // '\' ']', // ']' '^', // '^' '_', // '_' '`', // '`' 'a', // 'a' 'b', // 'b' 'c', // 'c' 'd', // 'd' 'e', // 'e' 'f', // 'f' 'g', // 'g' 'h', // 'h' 'i', // 'i' 'j', // 'j' 'k', // 'k' 'l', // 'l' 'm', // 'm' 'n', // 'n' 'o', // 'o' 'p', // 'p' 'q', // 'q' 'r', // 'r' 's', // 's' 't', // 't' 'u', // 'u' 'v', // 'v' 'w', // 'w' 'x', // 'x' 'y', // 'y' 'z', // 'z' '{', // '{' '|', // '|' '}', // '}' '~', // '~' #127, // #127 #194#128, // #128 #194#129, // #129 #194#130, // #130 #194#131, // #131 #194#132, // #132 #194#133, // #133 #194#134, // #134 #194#135, // #135 #194#136, // #136 #194#137, // #137 #194#138, // #138 #194#139, // #139 #194#140, // #140 #194#141, // #141 #194#142, // #142 #194#143, // #143 #194#144, // #144 #194#145, // #145 #194#146, // #146 #194#147, // #147 #194#148, // #148 #194#149, // #149 #194#150, // #150 #194#151, // #151 #194#152, // #152 #194#153, // #153 #194#154, // #154 #194#155, // #155 #194#156, // #156 #194#157, // #157 #194#158, // #158 #194#159, // #159 #194#160, // #160 #194#161, // #161 #194#162, // #162 #194#163, // #163 #194#164, // #164 #194#165, // #165 #194#166, // #166 #194#167, // #167 #194#168, // #168 #194#169, // #169 #194#170, // #170 #194#171, // #171 #194#172, // #172 #194#173, // #173 #194#174, // #174 #194#175, // #175 #194#176, // #176 #194#177, // #177 #194#178, // #178 #194#179, // #179 #194#180, // #180 #194#181, // #181 #194#182, // #182 #194#183, // #183 #194#184, // #184 #194#185, // #185 #194#186, // #186 #194#187, // #187 #194#188, // #188 #194#189, // #189 #194#190, // #190 #194#191, // #191 #195#128, // #192 #195#129, // #193 #195#130, // #194 #195#131, // #195 #195#132, // #196 #195#133, // #197 #195#134, // #198 #195#135, // #199 #195#136, // #200 #195#137, // #201 #195#138, // #202 #195#139, // #203 #195#140, // #204 #195#141, // #205 #195#142, // #206 #195#143, // #207 #195#144, // #208 #195#145, // #209 #195#146, // #210 #195#147, // #211 #195#148, // #212 #195#149, // #213 #195#150, // #214 #195#151, // #215 #195#152, // #216 #195#153, // #217 #195#154, // #218 #195#155, // #219 #195#156, // #220 #195#157, // #221 #195#158, // #222 #195#159, // #223 #195#160, // #224 #195#161, // #225 #195#162, // #226 #195#163, // #227 #195#164, // #228 #195#165, // #229 #195#166, // #230 #195#167, // #231 #195#168, // #232 #195#169, // #233 #195#170, // #234 #195#171, // #235 #195#172, // #236 #195#173, // #237 #195#174, // #238 #195#175, // #239 #195#176, // #240 #195#177, // #241 #195#178, // #242 #195#179, // #243 #195#180, // #244 #195#181, // #245 #195#182, // #246 #195#183, // #247 #195#184, // #248 #195#185, // #249 #195#186, // #250 #195#187, // #251 #195#188, // #252 #195#189, // #253 #195#190, // #254 #195#191 // #255 ); function UTF8BOMToUTF8(const s: string): string; begin Result:=copy(s,4,length(s)); end; function ISO_8859_1ToUTF8(const s: string): string; begin Result:=SingleByteToUTF8(s,ArrayISO_8859_1ToUTF8); end; function SingleByteToUTF8(const s: string; const Table: TCharToUTF8Table ): string; var len: Integer; i: Integer; Src: PChar; Dest: PChar; p: PChar; c: Char; begin if s='' then begin Result:=s; exit; end; len:=length(s); SetLength(Result,len*4);// UTF-8 is at most 4 bytes Src:=PChar(s); Dest:=PChar(Result); for i:=1 to len do begin c:=Src^; inc(Src); if ord(c)<128 then begin Dest^:=c; inc(Dest); end else begin p:=Table[c]; if p<>nil then begin while p^<>#0 do begin Dest^:=p^; inc(p); inc(Dest); end; end; end; end; SetLength(Result,PtrUInt(Dest)-PtrUInt(Result)); end; function UnicodeToISO_8859_1(Unicode: cardinal): integer; begin case Unicode of 0..255: Result:=Unicode; else Result:=-1; end; end; function UTF8ToUTF8BOM(const s: string): string; begin Result:=#$EF#$BB#$BF+s; end; function UTF8ToISO_8859_1(const s: string): string; begin Result:=UTF8ToSingleByte(s,@UnicodeToISO_8859_1); end; function UTF8ToSingleByte(const s: string; const UTF8CharConvFunc: TUnicodeToCharID): string; var len: Integer; Src: PChar; Dest: PChar; c: Char; Unicode: LongWord; CharLen: integer; i: integer; begin if s='' then begin Result:=''; exit; end; len:=length(s); SetLength(Result,len); Src:=PChar(s); Dest:=PChar(Result); while len>0 do begin c:=Src^; if c<#128 then begin Dest^:=c; inc(Dest); inc(Src); dec(len); end else begin Unicode:=UTF8CharacterToUnicode(Src,CharLen); inc(Src,CharLen); dec(len,CharLen); i:=UTF8CharConvFunc(Unicode); if i>=0 then begin Dest^:=chr(i); inc(Dest); end; end; end; SetLength(Result,Dest-PChar(Result)); end; procedure GetSupportedEncodings(List: TStrings); begin List.Add('UTF-8'); List.Add('UTF-8BOM'); List.Add('Ansi'); List.Add('CP1250'); List.Add('CP1251'); List.Add('CP1252'); List.Add('CP1253'); List.Add('CP1254'); List.Add('CP1255'); List.Add('CP1256'); List.Add('CP1257'); List.Add('CP1258'); List.Add('CP866'); List.Add('CP874'); List.Add('ISO-8859-1'); List.Add('KOI-8'); List.Add('UCS-2LE'); List.Add('UCS-2BE'); end; function UTF8CharacterToUnicode(p: PChar; out CharLen: integer): Cardinal; begin if p<>nil then begin if ord(p^)<%11000000 then begin // regular single byte character (#0 is a normal char, this is pascal ;) Result:=ord(p^); CharLen:=1; end else if ((ord(p^) and %11100000) = %11000000) then begin // could be double byte character if (ord(p[1]) and %11000000) = %10000000 then begin Result:=((ord(p^) and %00011111) shl 6) or (ord(p[1]) and %00111111); CharLen:=2; end else begin Result:=ord(p^); CharLen:=1; end; end else if ((ord(p^) and %11110000) = %11100000) then begin // could be triple byte character if ((ord(p[1]) and %11000000) = %10000000) and ((ord(p[2]) and %11000000) = %10000000) then begin Result:=((ord(p^) and %00011111) shl 12) or ((ord(p[1]) and %00111111) shl 6) or (ord(p[2]) and %00111111); CharLen:=3; end else begin Result:=ord(p^); CharLen:=1; end; end else if ((ord(p^) and %11111000) = %11110000) then begin // could be 4 byte character if ((ord(p[1]) and %11000000) = %10000000) and ((ord(p[2]) and %11000000) = %10000000) and ((ord(p[3]) and %11000000) = %10000000) then begin Result:=((ord(p^) and %00001111) shl 18) or ((ord(p[1]) and %00111111) shl 12) or ((ord(p[2]) and %00111111) shl 6) or (ord(p[3]) and %00111111); CharLen:=4; end else begin Result:=ord(p^); CharLen:=1; end; end else begin // invalid character Result:=ord(p^); CharLen:=1; end; end else begin Result:=0; CharLen:=0; end; end; end.