Converter: change the character encoding of source files to UTF-8

git-svn-id: trunk@31023 -
This commit is contained in:
juha 2011-06-03 16:19:19 +00:00
parent dfe60001a1
commit ba43bbf08f
3 changed files with 12 additions and 115 deletions

View File

@ -452,9 +452,6 @@ begin
end;
function TConvertDelphiUnit.CopyAndLoadFile: TModalResult;
var
// CurEncoding: String;
Changed: Boolean;
begin
IDEMessagesWindow.AddMsg(Format(lisConvDelphiConvertingFile,
[fOrigUnitFilename]), '', -1);
@ -475,12 +472,11 @@ begin
[lbfCheckIfText,lbfUpdateFromDisk],true);
if Result<>mrOk then exit;
// Change encoding to UTF-8
{ CurEncoding:=GuessEncoding(fPascalBuffer.Source); //fPascalBuffer.DiskEncoding;
if CurEncoding<>EncodingUTF8 then begin
fPascalBuffer.Source:=ConvertEncoding(fPascalBuffer.Source, CurEncoding, EncodingUTF8);
fPascalBuffer.DiskEncoding:=EncodingUTF8;
fPascalBuffer.MemEncoding:=EncodingUTF8;
end; }
if fPascalBuffer.DiskEncoding<>EncodingUTF8 then begin
IDEMessagesWindow.AddMsg(Format(lisConvDelphiChangedEncodingToUTF8,
[fPascalBuffer.DiskEncoding]), '', -1);
fPascalBuffer.DiskEncoding:=EncodingUTF8; // Takes effect when buffer is saved.
end;
// Create a shared link for codetools.
Assert(fCTLink=Nil, 'fCTLink should be Nil in CopyAndLoadFile');
fCTLink:=TCodeToolLink.Create(fPascalBuffer);
@ -499,8 +495,6 @@ var
LfmFilename: string; // Lazarus .LFM file name.
DFMConverter: TDFMConverter;
TempLFMBuffer: TCodeBuffer;
// CurEncoding: String;
Changed: Boolean;
begin
Result:=mrOK;
fLFMBuffer:=nil;
@ -538,23 +532,13 @@ begin
DFMConverter.Free;
end;
// Change encoding to UTF-8
if fSettings.FixEncoding then begin
Result:=LoadCodeBuffer(TempLFMBuffer,LfmFilename,
[lbfCheckIfText,lbfUpdateFromDisk],true);
// Note: EnUnicode is meant to be a temporary solution.
// LCL has other functions for char encoding.
TempLFMBuffer.Source:=EnUnicode(TempLFMBuffer.Source, Changed);
if Changed then
IDEMessagesWindow.AddMsg(lisConvDelphiChangedEncodingToUTF8, '', -1);
// TempLFMBuffer.SaveToFile(ChangeFileExt(TempLFMBuffer.Filename, '_utf8.lfm'));
Result:=LoadCodeBuffer(TempLFMBuffer,LfmFilename,
[lbfCheckIfText,lbfUpdateFromDisk],true);
if TempLFMBuffer.DiskEncoding<>EncodingUTF8 then begin
IDEMessagesWindow.AddMsg(Format(lisConvDelphiChangedEncodingToUTF8,
[TempLFMBuffer.DiskEncoding]), '', -1);
TempLFMBuffer.DiskEncoding:=EncodingUTF8;
TempLFMBuffer.Save;
{ CurEncoding:=GuessEncoding(TempLFMBuffer.Source);
if CurEncoding<>EncodingUTF8 then begin
ShowMessage('Encoding = ' + CurEncoding);
TempLFMBuffer.Source:=ConvertEncoding(TempLFMBuffer.Source, CurEncoding, EncodingUTF8);
TempLFMBuffer.DiskEncoding:=EncodingUTF8;
TempLFMBuffer.MemEncoding:=EncodingUTF8;
end; }
end;
// Read form file code in.
if not fSettings.SameDfmFile then begin

View File

@ -129,98 +129,11 @@ type
destructor Destroy; override;
end;
function EnUnicode(const TS: UTF8String; var Changed: Boolean): UTF8String;
implementation
{$R *.lfm}
{*******************************************************************************
  function UTFEnc(S: ansistring): WideChar;

  Converts the textual form of one character code into that character.

  S      - a decimal character code, optionally prefixed with '#',
           e.g. '#1234' or '1234'. At most four characters after the
           optional '#' are parsed; anything beyond that is ignored.
  Result - the WideChar with that code, or #0 when S is empty, is not a
           number, or lies outside 1..High(Word).
*******************************************************************************}
function UTFEnc(S: ansistring): WideChar;
var
  Code: Integer;  // parsed as Integer first so out-of-range values can be rejected
begin
  Result:=WideChar(0);
  if S='' then
    exit;                                  // never index S[1] on an empty string
  if S[1]='#' then
    Code:=StrToIntDef(Copy(S,2,4),0)       // skip the '#', whatever the length of S
  else
    Code:=StrToIntDef(Copy(S,1,4),0);
  // Negative or oversized values must not be squeezed into a 16-bit character.
  if (Code>0) and (Code<=High(Word)) then
    Result:=WideChar(Code);
end;
{*******************************************************************************
  function EnUnicode(const TS: UTF8String; var Changed: Boolean): UTF8String;

  Rewrites character-code sequences in TS as literal characters.

  TS      - text such as <<Caption = #1234#1235':'#1258>>.
  Changed - set to True whenever TS contains at least one apostrophe or '#',
            whether or not the produced text actually differs from TS;
            set to False otherwise.
  Result  - text such as <<Caption = 'АБ:В'>>, passed through AnsiToUtf8.

  How it works: everything before the first apostrophe or '#' is copied as is.
  From there up to the last apostrophe / last '#'-sequence, every '#' found
  outside apostrophes is handed to UTFEnc together with the four characters
  that follow it, apostrophes themselves are dropped, and all other characters
  are copied. The processed span is then wrapped in a single pair of
  apostrophes and the rest of TS is appended unchanged.

  Known limitations:
  - Every '#' sequence is assumed to be exactly '#' plus four characters.
  - Strings containing several pairs of apostrophes are merged:
    <<Form1.Caption := 'String1 '+'String2'>> becomes
    <<Form1.Caption := 'String1 +String2'>>.
*******************************************************************************}
function EnUnicode(const TS: UTF8String; var Changed: Boolean): UTF8String;
var
i,
LPA, //LPA = Left Position of Apostrophe: first position of ' in TS
LPS, //LPS = Left Position of Sharp: first position of # in TS
RPA, //RPA = Right Position of Apostrophe: last position of ' in TS
RPS: integer; //RPS = Right Position of Sharp: last position of # in TS (later advanced by 4)
insideAp: Boolean;//True while the scan is between an opening and a closing apostrophe
S,WS: ansistring; //S collects text copied as is; WS collects the converted span, character by character
begin
Changed:=False;
S:='';
insideAp:=false;
RPS:=0; RPA:=0;
for i:=1 to Length(TS) do begin //find the last positions of ' and #
if TS[i]='#' then
RPS:=i; //remember the most recent '#'
if TS[i]='''' then
RPA:=i; //remember the most recent apostrophe
end;
RPS:=RPS+4; //point at the last character of the final #1234 sequence rather than at its '#'
i:=1; //now find the FIRST positions of ' and #
LPA:=Pos('''',TS); //a missing apostrophe is represented by
if LPA=0 then
LPA:=Length(TS)+1; //a position one past the end of TS
LPS:=Pos('#',TS); //same convention for '#'
if LPS=0 then LPS:=Length(TS)+1;
if (LPA<LPS) AND (LPA<=Length(TS)) then begin //start at whichever of ' or # comes first
i:=LPA;
end else if (LPS<LPA) AND (LPS<=Length(TS)) then begin
i:=LPS;
end;
if (RPS<=LPS) OR (RPS<=4) OR (RPS>Length(TS)+4) then
RPS:=0; //no usable right bound for '#' sequences (RPS<=4 means no '#' was found)
if (RPA<=LPA) OR (RPA=0) OR (RPA>Length(TS)) then
RPA:=0; //no usable right bound for apostrophes (none found, or only a single one)
if (LPA<=Length(TS)) OR (LPS<=Length(TS)) then begin //found ' or # or both - start processing
S:=LeftStr(TS,i-1); //the part of TS before the first ' or # is copied as is
WS:='';
while (i<=Length(TS)) AND ((i<=RPA) OR (i<=RPS)) do begin //process up to the right-most bound
if TS[i]='''' then begin //current character is an apostrophe
insideAp:=not insideAp; //text between apostrophes is copied as is,
end; //even if it contains #1234 sequences
if (not insideAp) AND (TS[i]='#') then begin //a '#' outside apostrophes
// NOTE(review): WS is an ansistring, so the WideChar returned by UTFEnc is
// implicitly converted when appended - confirm this matches what AnsiToUtf8 expects.
WS:=WS+UTFEnc(Copy(TS,i,5)); //convert '#' plus the next four characters
i:=i+4; //skip the four characters that were just consumed
end else
if TS[i]<>'''' then //apostrophes themselves are dropped
WS:=WS+TS[i];
i:=i+1;
end;
//Form1.Memo1.Lines.Add(S+'|'+WS+'|'+Copy(TS,i,Length(TS)));//former debug output
S:=S+''''+WS+''''; //wrap the processed span in one pair of apostrophes
S:=S+Copy(TS,i,Length(TS)); //append the rest of the string as is
Changed:=True; //set unconditionally on this path, even if the text ends up identical
end {if (LPA<=Length(TS))OR(LPS<=Length(TS))}
else
S:=TS; //TS contains neither ' nor #: copy as is
Result:=AnsiToUtf8(S); //S is converted with AnsiToUtf8 before being returned
end;
function IsMissingType(LFMError: TLFMError): boolean;
begin
with LFMError do

View File

@ -5139,7 +5139,7 @@ resourcestring
lisConvDelphiConvertingUnitFiles = '*** Converting unit files ... ***';
lisConvDelphiConvertingFile = '* Converting file %s *';
lisConvDelphiFixingUsedUnits = '* Fixing used units for file %s *';
lisConvDelphiChangedEncodingToUTF8 = 'Changed encoding to UTF-8';
lisConvDelphiChangedEncodingToUTF8 = 'Changed encoding from %s to UTF-8';
lisConvDelphiErrorCanTFindUnit = '%s(%s,%s) Error: Can''t find unit %s';
lisConvDelphiAllSubDirsScanned = 'All sub-directories will be scanned for unit files';
lisConvDelphiMissingIncludeFile = '%s(%s,%s) missing include file';