wikiget: using a more compact format for encoding files

git-svn-id: trunk@47495 -
This commit is contained in:
mattias 2015-01-23 01:14:00 +00:00
parent cdfa4e65c2
commit 18cb6a7867
4 changed files with 165 additions and 75 deletions

View File

@ -323,7 +323,7 @@ var
else if Scheme='image' then begin else if Scheme='image' then begin
URL:=copy(URL,p+1,length(URL)); URL:=copy(URL,p+1,length(URL));
URL:=UTF8Trim(URL); URL:=UTF8Trim(URL);
URL:=WikiInternalLinkToPage(URL); URL:=WikiTitleToPage(URL);
if URL='' then exit; if URL='' then exit;
Filename:=WikiImageToFilename(URL,false,true,true); Filename:=WikiImageToFilename(URL,false,true,true);
FoundImgFile:=FindImage(Filename); FoundImgFile:=FindImage(Filename);
@ -345,7 +345,7 @@ var
end; end;
// convert %hh and remove special characters // convert %hh and remove special characters
URL:=WikiInternalLinkToPage(URL); URL:=WikiTitleToPage(URL);
// check if link to wiki page but with a full baseurl // check if link to wiki page but with a full baseurl
if (Page.WikiPage.BaseURL<>'') if (Page.WikiPage.BaseURL<>'')

View File

@ -22,6 +22,8 @@ unit WikiFormat;
{$mode objfpc}{$H+} {$mode objfpc}{$H+}
{off $DEFINE VerboseWikiFileCode}
interface interface
uses uses
@ -101,8 +103,8 @@ function WikiImageToFilename(Image: string; IsInternalLink, InsertCaseID: boolea
function WikiHeaderToLink(Header: string): string; function WikiHeaderToLink(Header: string): string;
function WikiCreateCommonCodeTagList(AddLazWikiLangs: boolean): TKeyWordFunctionList; function WikiCreateCommonCodeTagList(AddLazWikiLangs: boolean): TKeyWordFunctionList;
function UTF8ToUTF7W(AnUTF8: string): string; function UTF8ToWikiFileCode(AnUTF8: string): string;
function UTF7WToUTF8(anUTF7: string): string; function WikiFileCodeToUTF8(aFileCode: string): string;
procedure TestWikiPageToFilename; procedure TestWikiPageToFilename;
// language // language
@ -356,25 +358,17 @@ begin
end; end;
function WikiPageToFilename(DocumentName: string; IsInternalLink, AppendCaseID: boolean): string; function WikiPageToFilename(DocumentName: string; IsInternalLink, AppendCaseID: boolean): string;
var { IsInternalLink=true: first convert the title with WikiTitleToPage
i: Integer; AppendCaseID=true: append a string encoding upper/lower case of letters
s: string; }
begin begin
Result:=DocumentName; Result:=DocumentName;
// optional: convert title to wiki link
if IsInternalLink then if IsInternalLink then
Result:=WikiInternalLinkToPage(Result); Result:=WikiTitleToPage(Result);
i:=1; // convert special characters
while i<=length(Result) do begin Result:=UTF8ToWikiFileCode(Result);
s:=Result[i]; // append case ID if requested
case s[1] of
',','-','_','0'..'9','a'..'z','A'..'Z': ;
// Note: UTF-8 characters do not work with svn on OS X
else s:='%'+HexStr(ord(s[1]),2);
end;
if s<>Result[i] then
ReplaceSubstring(Result,i,1,s);
inc(i,length(s));
end;
if AppendCaseID and (Result<>'') then if AppendCaseID and (Result<>'') then
Result:=Result+'.'+WikiPageToCaseID(Result); Result:=Result+'.'+WikiPageToCaseID(Result);
end; end;
@ -383,10 +377,6 @@ function WikiFilenameToPage(Filename: string): string;
var var
Ext: String; Ext: String;
p: Integer; p: Integer;
i: Integer;
Code: Integer;
j: Integer;
c: Char;
begin begin
Result:=ExtractFileName(Filename); Result:=ExtractFileName(Filename);
if Result='' then exit; if Result='' then exit;
@ -400,22 +390,8 @@ begin
dec(p); dec(p);
if (p>=1) and (Result[p]='.') then if (p>=1) and (Result[p]='.') then
Delete(Result,p,length(Result)); Delete(Result,p,length(Result));
// convert non literals // convert special characters
for i:=length(Result) downto 1 do begin Result:=WikiFileCodeToUTF8(Result);
if Result[i]<>'%' then continue;
Code:=0;
for j:=1 to 2 do begin
if i+j>length(Result) then break;
c:=Result[i+j];
case c of
'0'..'9': Code:=Code*16+ord(c)-ord('0');
'a'..'z': Code:=Code*16+ord(c)-ord('a')+10;
'A'..'Z': Code:=Code*16+ord(c)-ord('A')+10;
else break;
end;
end;
ReplaceSubstring(Result,i,1+j,chr(Code));
end;
end; end;
function WikiImageToFilename(Image: string; function WikiImageToFilename(Image: string;
@ -434,7 +410,7 @@ begin
Delete(Result,1,p); Delete(Result,1,p);
end; end;
if IsInternalLink then if IsInternalLink then
Result:=WikiInternalLinkToPage(Result); Result:=WikiTitleToPage(Result);
Ext:=ExtractFileExt(Result); Ext:=ExtractFileExt(Result);
// encode file name without extension // encode file name without extension
Result:=WikiPageToFilename(copy(Result,1,Length(Result)-length(Ext)),false,false); Result:=WikiPageToFilename(copy(Result,1,Length(Result)-length(Ext)),false,false);
@ -490,11 +466,13 @@ begin
end; end;
const const
UTF7WNormalChars = ['a'..'z','A'..'Z','0'..'9',',','-','_']; WFCAllowedChars = ['a'..'z','A'..'Z','0'..'9',',','!','#','%','(',')','-','_',' '];
function UTF8ToUTF7W(AnUTF8: string): string; function UTF8ToWikiFileCode(AnUTF8: string): string;
{ Keep a..z, A..Z, 0..9, , - _ % = { Keep a..z, A..Z, 0..9 and , ! # % ( ) - _ space
Replace + with +- Replace + with +-
Rest encode as +base64- Replace = with =-
Replace single invalid byte with =HexHex
Replace sequences of invalid bytes as +base64-
} }
const const
Base64Chars: array[0..63] of char = Base64Chars: array[0..63] of char =
@ -507,20 +485,38 @@ var
c: Char; c: Char;
SrcBits: Integer; SrcBits: Integer;
i: Integer; i: Integer;
AtEnd: Boolean;
begin begin
{$ifdef VerboseWikiFileCode}
writeln('UTF8ToWikiFileCode START AnUTF8="',AnUTF8,'"');
{$endif}
Result:=''; Result:='';
if AnUTF8='' then exit; if AnUTF8='' then exit;
p:=PChar(AnUTF8); p:=PChar(AnUTF8);
repeat repeat
c:=p^; c:=p^;
if (c=#0) and (p-PChar(AnUTF8)=length(AnUTF8)) then break; if (c=#0) and (p-PChar(AnUTF8)=length(AnUTF8)) then break;
if c in UTF7WNormalChars then begin AtEnd:=(p[1]=#0) and (p+1-PChar(AnUTF8)=length(AnUTF8));
if c in WFCAllowedChars then begin
// common English character: keep // common English character: keep
{$ifdef VerboseWikiFileCode}
writeln('UTF8ToWikiFileCode normal char "',c,'"');
{$endif}
Result+=c; Result+=c;
inc(p); inc(p);
end else if (c='+') and (p[1] in UTF7WNormalChars) then begin end else if (c in ['+','=']) and ((p[1] in WFCAllowedChars) or AtEnd) then begin
// replace '+' with '+-' // replace '+' with '+-' and '=' with '=-'
Result+='+-'; {$ifdef VerboseWikiFileCode}
writeln('UTF8ToWikiFileCode single + or = "',c,'"');
{$endif}
Result+=c+'-';
inc(p);
end else if (p[1] in WFCAllowedChars) or AtEnd then begin
// replace single special byte with =HexHex
{$ifdef VerboseWikiFileCode}
writeln('UTF8ToWikiFileCode single special char "',HexStr(ord(c),2),'"');
{$endif}
Result+='='+HexStr(ord(c),2);
inc(p); inc(p);
end else begin end else begin
// special characters -> encode base64 // special characters -> encode base64
@ -532,24 +528,41 @@ begin
repeat repeat
if (p^=#0) and (p-PChar(AnUTF8)=length(AnUTF8)) then if (p^=#0) and (p-PChar(AnUTF8)=length(AnUTF8)) then
break; // end of string break; // end of string
if (p^ in UTF7WNormalChars) and (p[1] in UTF7WNormalChars) if (p^ in WFCAllowedChars) and (p[1] in WFCAllowedChars)
and (p[2] in UTF7WNormalChars) then and (p[2] in WFCAllowedChars) then
break; // the next three are normal characters -> stop encoding as base64 break; // the next three are normal characters -> stop encoding as base64
CharLen:=UTF8CharacterLength(p); CharLen:=UTF8CharacterLength(p);
{$ifdef VerboseWikiFileCode}
writeln('UTF8ToWikiFileCode sequence UTF8CharLen=',CharLen);
{$endif}
for i:=1 to CharLen do begin for i:=1 to CharLen do begin
SrcBits:=ord(p^); SrcBits:=ord(p^);
BufBits:=BufBits or (SrcBits shr (2+BufBitLen)); BufBits:=BufBits or (SrcBits shr (2+BufBitLen));
{$ifdef VerboseWikiFileCode}
writeln('UTF8ToWikiFileCode sequence 6bitA Byte=',i,' SrcBits=',binstr(SrcBits,8),' BufBits=',binstr(BufBits,6),' BufBitLen=',6);
{$endif}
Result+=Base64Chars[BufBits]; Result+=Base64Chars[BufBits];
BufBits:=(SrcBits shl (4-BufBitLen)) and %111111; BufBits:=(SrcBits shl (4-BufBitLen)) and %111111;
BufBitLen:=2+BufBitLen; BufBitLen:=2+BufBitLen;
if BufBitLen=6 then begin if BufBitLen=6 then begin
{$ifdef VerboseWikiFileCode}
writeln('UTF8ToWikiFileCode sequence 6bitB Byte=',i,' SrcBits=',binstr(SrcBits,8),' BufBits=',binstr(BufBits,6),' BufBitLen=',BufBitLen);
{$endif}
Result+=Base64Chars[BufBits]; Result+=Base64Chars[BufBits];
BufBitLen:=0; BufBitLen:=0;
BufBits:=0;
end else begin
{$ifdef VerboseWikiFileCode}
writeln('UTF8ToWikiFileCode sequence <6bit Byte=',i,' SrcBits=',binstr(SrcBits,8),' BufBits=',binstr(BufBits,6),' BufBitLen=',BufBitLen);
{$endif}
end; end;
inc(p); inc(p);
end; end;
until false; until false;
if BufBitLen>0 then begin if BufBitLen>0 then begin
{$ifdef VerboseWikiFileCode}
writeln('UTF8ToWikiFileCode sequence Paddi Byte=',i,' SrcBits=',binstr(SrcBits,8),' BufBits=',binstr(BufBits,6),' BufBitLen=',BufBitLen);
{$endif}
Result+=Base64Chars[BufBits]; Result+=Base64Chars[BufBits];
end; end;
// end marker: '-' // end marker: '-'
@ -558,7 +571,7 @@ begin
until false; until false;
end; end;
function UTF7WToUTF8(anUTF7: string): string; function WikiFileCodeToUTF8(aFileCode: string): string;
var var
p: PChar; p: PChar;
SrcBits: Integer; SrcBits: Integer;
@ -566,19 +579,28 @@ var
BufBitLen: Integer; BufBitLen: Integer;
c: Char; c: Char;
begin begin
{$ifdef VerboseWikiFileCode}
writeln('WikiFileCodeToUTF8 Code="',aFileCode,'"');
{$endif}
Result:=''; Result:='';
if anUTF7='' then exit; if aFileCode='' then exit;
p:=PChar(anUTF7); p:=PChar(aFileCode);
repeat repeat
c:=p^; c:=p^;
if (c=#0) and (p-PChar(anUTF7)=length(anUTF7)) then break; if (c=#0) and (p-PChar(aFileCode)=length(aFileCode)) then break;
if c='+' then begin if c='+' then begin
inc(p); inc(p);
if p^='-' then begin if p^='-' then begin
{$ifdef VerboseWikiFileCode}
writeln('WikiFileCodeToUTF8 +- to +');
{$endif}
inc(p); inc(p);
Result+='+'; // single '+' Result+='+'; // single '+'
end else begin end else begin
// decode base64, read until '-' // decode base64, read until '-'
{$ifdef VerboseWikiFileCode}
writeln('WikiFileCodeToUTF8 base64 sequence');
{$endif}
BufBits:=0; BufBits:=0;
BufBitLen:=0; BufBitLen:=0;
repeat repeat
@ -595,30 +617,86 @@ begin
',': SrcBits:=62; ',': SrcBits:=62;
'_': SrcBits:=63; '_': SrcBits:=63;
else else
raise Exception.Create('invalid UTF7: invalid base64 character'); raise Exception.Create('invalid wiki file code: invalid base64 character');
end; end;
{$ifdef VerboseWikiFileCode}
writeln('WikiFileCodeToUTF8 SrcBits=',binstr(SrcBits,6));
{$endif}
if BufBitLen=0 then begin if BufBitLen=0 then begin
BufBits:=BufBits or (SrcBits shl 2); BufBits:=BufBits or (SrcBits shl 2);
BufBitLen:=6; BufBitLen:=6;
{$ifdef VerboseWikiFileCode}
writeln('WikiFileCodeToUTF8 new byte BufBits=',binstr(BufBits,8),' BufBitLen=',BufBitLen);
{$endif}
end else begin end else begin
BufBits:=BufBits or (SrcBits shr (BufBitLen-2)); BufBits:=BufBits or (SrcBits shr (BufBitLen-2));
{$ifdef VerboseWikiFileCode}
writeln('WikiFileCodeToUTF8 byte complete BufBits=',binstr(BufBits,8),' BufBitLen=',8);
{$endif}
Result+=chr(BufBits); Result+=chr(BufBits);
BufBitLen-=2; BufBitLen-=2;
BufBits:=(SrcBits shl (8-BufBitLen)) and $FF; BufBits:=(SrcBits shl (8-BufBitLen)) and $FF;
{$ifdef VerboseWikiFileCode}
writeln('WikiFileCodeToUTF8 rest byte BufBits=',binstr(BufBits,8),' BufBitLen=',BufBitLen);
{$endif}
end; end;
inc(p); inc(p);
until false; until false;
// Note: BufBitLen can be >0 (the last byte contains padding bits) // Note: BufBitLen can be >0 (the last byte contains padding bits)
{$ifdef VerboseWikiFileCode}
writeln('WikiFileCodeToUTF8 D BufBits=',binstr(BufBits,8),' BufBitLen=',BufBitLen);
{$endif}
if (BufBits shr (8-BufBitLen))>0 then
raise Exception.Create('invalid wiki file code: padding bits not empty');
end; end;
end else if c in UTF7WNormalChars then begin end else if c='=' then begin
inc(p);
if p^='-' then begin
{$ifdef VerboseWikiFileCode}
writeln('WikiFileCodeToUTF8 =- to =');
{$endif}
Result+='=';
inc(p);
end else begin
// one byte as hex code
{$ifdef VerboseWikiFileCode}
writeln('WikiFileCodeToUTF8 =hex ',p^,p[1]);
{$endif}
SrcBits:=0;
case p^ of
'0'..'9': SrcBits:=ord(p^)-ord('0');
'A'..'F': SrcBits:=ord(p^)-ord('A')+10;
'a'..'f': SrcBits:=ord(p^)-ord('a')+10;
else
raise Exception.Create('invalid wiki file code: invalid hex code');
end;
inc(p);
SrcBits:=SrcBits*16;
case p^ of
'0'..'9': SrcBits+=ord(p^)-ord('0');
'A'..'F': SrcBits+=ord(p^)-ord('A')+10;
'a'..'f': SrcBits+=ord(p^)-ord('a')+10;
else
raise Exception.Create('invalid wiki file code: invalid hex code');
end;
inc(p);
{$ifdef VerboseWikiFileCode}
writeln('WikiFileCodeToUTF8 =hex byte=',SrcBits);
{$endif}
Result+=chr(SrcBits);
end;
end else if c in WFCAllowedChars then begin
// normal char // normal char
{$ifdef VerboseWikiFileCode}
writeln('WikiFileCodeToUTF8 normal char "',c,'"');
{$endif}
Result+=c; Result+=c;
inc(p); inc(p);
end else end else
raise Exception.Create('invalid UTF7: invalid character'); raise Exception.Create('invalid wiki file code: invalid character');
until false; until false;
if FindInvalidUTF8Character(PChar(Result),length(Result))>=0 then if FindInvalidUTF8Character(PChar(Result),length(Result))>=0 then
raise Exception.Create('invalid UTF7: result is not UTF-8'); raise Exception.Create('invalid wiki file code: result is not UTF-8');
end; end;
procedure TestWikiPageToFilename; procedure TestWikiPageToFilename;
@ -627,24 +705,24 @@ procedure TestWikiPageToFilename;
var var
Filename: String; Filename: String;
NewPageName: String; NewPageName: String;
ok: Boolean; step: integer;
begin begin
ok:=false; step:=0;
try try
Filename:=WikiPageToFilename(PageName,false,true); Filename:=WikiPageToFilename(PageName,false,true);
NewPageName:=WikiFilenameToPage(Filename); NewPageName:=WikiFilenameToPage(Filename);
if PageName=NewPageName then begin inc(step);
ok:=true; if PageName=NewPageName then
exit; inc(step);
end;
finally finally
if not ok then begin if step<2 then begin
writeln('TestPageToFilename failed:'); writeln('TestPageToFilename failed:');
writeln(' PageName ="',PageName,'"'); writeln(' PageName ="',PageName,'"');
writeln(' NewPageName="',NewPageName,'"'); writeln(' NewPageName="',NewPageName,'"');
writeln(' Filename ="',Filename,'"'); writeln(' Filename ="',Filename,'"');
raise Exception.Create('TestPageToFilename failed');
end; end;
if step=0 then
raise Exception.Create('TestPageToFilename failed');
end; end;
end; end;
@ -658,6 +736,17 @@ begin
t('A/B'); t('A/B');
t('A+B'); t('A+B');
t('A-B'); t('A-B');
t('A=B');
t('A+=B');
t('A+-B');
t('A+');
t('A=');
t('A*');
t('A*$');
t('A*$%');
t('A*$*$');
t('A*$*$*');
t('A*$*$*$');
end; end;
function GetWikiPageLanguage(const DocumentName: string): string; function GetWikiPageLanguage(const DocumentName: string): string;

View File

@ -333,8 +333,8 @@ var
IsWikiTagStartChar, IsWikiTagStartChar,
IsWikiTagChar: array[char] of boolean; IsWikiTagChar: array[char] of boolean;
// normalize link to get the page, e.g. convert spaces to underscores // normalize link to get the page, e.g. convert spaces to underscores, delete #0,$[]{}<>
function WikiInternalLinkToPage(Link: string): string; function WikiTitleToPage(Link: string): string;
function WikiIsExternalLink(Link: string): boolean; function WikiIsExternalLink(Link: string): boolean;
function GetWikiPageID(doc: TDOMNode): string; function GetWikiPageID(doc: TDOMNode): string;
@ -1513,7 +1513,7 @@ begin
end; end;
end; end;
function WikiInternalLinkToPage(Link: string): string; function WikiTitleToPage(Link: string): string;
var var
i: Integer; i: Integer;
j: Integer; j: Integer;

View File

@ -194,7 +194,7 @@ begin
Param:=GetParams(i); Param:=GetParams(i);
//writeln('TWikiGet.DoRun Param="',Param,'"'); //writeln('TWikiGet.DoRun Param="',Param,'"');
if copy(Param,1,length(pPage))=pPage then if copy(Param,1,length(pPage))=pPage then
NeedWikiPage(WikiInternalLinkToPage(copy(Param,length(pPage)+1,length(Param)))); NeedWikiPage(WikiTitleToPage(copy(Param,length(pPage)+1,length(Param))));
end; end;
if (NeedSinglePage) and (FNeededPages.Tree.Count=0) then if (NeedSinglePage) and (FNeededPages.Tree.Count=0) then
E('nothing to do',true); E('nothing to do',true);
@ -539,7 +539,7 @@ begin
try try
Client:=TFPHTTPClient.Create(nil); Client:=TFPHTTPClient.Create(nil);
Response:=TMemoryStream.Create; Response:=TMemoryStream.Create;
URL:=BaseURL+EscapeDocumentName('Image:'+WikiInternalLinkToPage(Link)); URL:=BaseURL+EscapeDocumentName('Image:'+WikiTitleToPage(Link));
writeln('getting image page "',URL,'" ...'); writeln('getting image page "',URL,'" ...');
Client.Get(URL,Response); Client.Get(URL,Response);
//Client.ResponseHeaders.SaveToFile('responseheaders.txt'); //Client.ResponseHeaders.SaveToFile('responseheaders.txt');
@ -722,13 +722,15 @@ procedure TWikiGet.Test;
Filename: String; Filename: String;
begin begin
debugln(['TWikiGet.Test [',URL,']']); debugln(['TWikiGet.Test [',URL,']']);
Page:=WikiInternalLinkToPage(URL); Page:=WikiTitleToPage(URL);
debugln([' URL=[',dbgstr(URL),'] Page=[',Page,']']); debugln([' URL=[',dbgstr(URL),'] Page=[',Page,']']);
Filename:=WikiImageToFilename(Page,false,true); Filename:=WikiImageToFilename(Page,false,true);
debugln([' URL=[',dbgstr(URL),'] Filename="',Filename,'"']); debugln([' URL=[',dbgstr(URL),'] Filename="',Filename,'"']);
end; end;
begin begin
TestWikiPageToFilename;
//w('Image:Acs_demos.jpg'); //w('Image:Acs_demos.jpg');
//w('Image:Acs demos.jpg'); //w('Image:Acs demos.jpg');
w('Image:Acs%20demos.jpg'); w('Image:Acs%20demos.jpg');
@ -790,9 +792,8 @@ end;
var var
Application: TWikiGet; Application: TWikiGet;
begin begin
//TestWikiPageToFilename;
Application:=TWikiGet.Create(nil); Application:=TWikiGet.Create(nil);
//Application.Test;
Application.Title:='Wiki Get'; Application.Title:='Wiki Get';
Application.Run; Application.Run;
Application.Free; Application.Free;