From 4c744032d9cd58b8cf5b3b6e7c38684897d3019c Mon Sep 17 00:00:00 2001 From: michael Date: Sat, 19 Aug 2006 09:38:12 +0000 Subject: [PATCH] * Some fixes from Sergei Gorelkin, plus RFC3986 additions git-svn-id: trunk@4437 - --- packages/base/netdb/testuri.pp | 111 +++++++- packages/base/netdb/uriparser.pp | 427 ++++++++++++++++++++++--------- 2 files changed, 407 insertions(+), 131 deletions(-) diff --git a/packages/base/netdb/testuri.pp b/packages/base/netdb/testuri.pp index 2f8b570708..24517d7982 100644 --- a/packages/base/netdb/testuri.pp +++ b/packages/base/netdb/testuri.pp @@ -12,16 +12,20 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. **********************************************************************} -{$MODE objfpc} -{$H+} +program TestUri; -program Testuri; +{$IFDEF FPC} +{$MODE OBJFPC}{$H+} +{$ENDIF} -uses URIParser; +uses + uriparser; var URI: TURI; s: String; + +procedure TestParse; begin with URI do begin @@ -34,12 +38,15 @@ begin Document := 'some index.html'; Params := 'param1=value1¶m2=value2'; Bookmark := 'bookmark'; + HasAuthority := True; end; s := EncodeURI(URI); WriteLn(s); + Finalize(URI); FillChar(URI, SizeOf(URI), #0); + Writeln; // URI := ParseURI(s, 'defaultprotocol', 1234); URI:=ParseURI('http://www.lazarus.freepascal.org/main.php'); @@ -55,5 +62,101 @@ begin WriteLn('Params: ', Params); WriteLn('Bookmark: ', Bookmark); end; +end; +type + urirec = record + a, b: string + end; + +const + Base = 'http://a/b/c/d;p?q'; + + tests: array[0..22] of urirec = ( + (a: 'g:h'; b: 'g:h'), + (a: 'g'; b: 'http://a/b/c/g'), + (a: './g'; b: 'http://a/b/c/g'), + (a: 'g/'; b: 'http://a/b/c/g/'), + (a: '/g'; b: 'http://a/g'), + (a: '//g'; b: 'http://g'), + (a: '?y'; b: 'http://a/b/c/d;p?y'), + (a: 'g?y'; b: 'http://a/b/c/g?y'), + (a: '#s'; b: 'http://a/b/c/d;p?q#s'), + (a: 'g#s'; b: 'http://a/b/c/g#s'), + (a: 'g?y#s'; b: 'http://a/b/c/g?y#s'), + (a: ';x'; b: 'http://a/b/c/;x'), + (a: 'g;x'; b: 'http://a/b/c/g;x'), + (a: 'g;x?y#s'; b: 'http://a/b/c/g;x?y#s'), + (a: ''; b: 'http://a/b/c/d;p?q'), + (a: '.'; b: 'http://a/b/c/'), + (a: './'; b: 'http://a/b/c/'), + (a: '..'; b: 'http://a/b/'), + (a: '../'; b: 'http://a/b/'), + (a: '../g'; b: 'http://a/b/g'), + (a: '../..'; b: 'http://a/'), + (a: '../../'; b: 'http://a/'), + (a: '../../g'; b: 'http://a/g') + ); + + tests1: array[0..1] of urirec = ( + (a: '../../../g'; b: 'http://a/g'), + (a: '../../../../g'; b: 'http://a/g') + ); + + tests2: array[0..5] of urirec = ( + (a: '/./g'; b: 'http://a/g'), + (a: '/../g'; b: 'http://a/g'), + (a: 'g.'; b: 'http://a/b/c/g.'), + (a: '.g'; b: 'http://a/b/c/.g'), + (a: 'g..'; b: 'http://a/b/c/g..'), + (a: '..g'; b: 'http://a/b/c/..g') + ); + + tests3: array[0..5] of urirec = ( + (a: './../g'; b: 'http://a/b/g'), + (a: './g/.'; b: 'http://a/b/c/g/'), + (a: 'g/./h'; b: 'http://a/b/c/g/h'), + (a: 'g/../h'; b: 'http://a/b/c/h'), + (a: 'g;x=1/./y'; b: 'http://a/b/c/g;x=1/y'), + (a: 'g;x=1/../y'; b: 'http://a/b/c/y') + ); + + tests4: array[0..3] of urirec = ( + (a: 'g?y/./x'; b: 'http://a/b/c/g?y/./x'), + (a: 'g?y/../x'; b: 'http://a/b/c/g?y/../x'), + (a: 'g#s/./x'; b: 'http://a/b/c/g#s/./x'), + (a: 'g#s/../x'; b: 'http://a/b/c/g#s/../x') + ); + +procedure Test(const Caption: string; const t: array of urirec); +var + rslt: UTF8String; + i: Integer; + Failed: Boolean; +begin + write(Caption, '...'); + Failed := False; + for i := low(t) to high(t) do + begin + ResolveRelativeUri(Base, t[i].a, rslt); + if rslt <> t[i].b then + begin + if not Failed then writeln; + Failed := True; + writeln('Test ', i, ' mismatch, expected: ''', t[i].b, '''; got: ''', rslt, ''''); + end; + end; + if not Failed then + writeln(' OK'); +end; + +begin + TestParse; + Writeln; + Writeln('Now testing relative URI resolving:'); + Test('Normal tests', tests); + Test('URI authority is not changed by using dot segments', tests1); + Test('Dot segments are removed only if they are complete path components', tests2); + Test('Testing some nonsensical forms of URI', tests3); + Test('Testing dot segments present in query or fragments', tests4); end. diff --git a/packages/base/netdb/uriparser.pp b/packages/base/netdb/uriparser.pp index fe4cf3f924..170e257dd4 100644 --- a/packages/base/netdb/uriparser.pp +++ b/packages/base/netdb/uriparser.pp @@ -13,8 +13,10 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. **********************************************************************} +{$IFDEF FPC} {$MODE objfpc} {$H+} +{$ENDIF} unit URIParser; @@ -31,42 +33,55 @@ type Document: String; Params: String; Bookmark: String; + HasAuthority: Boolean; end; function EncodeURI(const URI: TURI): String; -function ParseURI(const URI: String): TURI; -function ParseURI(const URI, DefaultProtocol: String; DefaultPort: Word): TURI; +function ParseURI(const URI: String): TURI; overload; +function ParseURI(const URI, DefaultProtocol: String; DefaultPort: Word): TURI; overload; +function ResolveRelativeURI(const BaseUri, RelUri: WideString; + out ResultUri: WideString): Boolean; overload; + +function ResolveRelativeURI(const BaseUri, RelUri: UTF8String; + out ResultUri: UTF8String): Boolean; overload; + +function URIToFilename(const URI: string; out Filename: string): Boolean; +function FilenameToURI(const Filename: string): string; + +function IsAbsoluteURI(const UriReference: string): Boolean; implementation uses SysUtils; const - HexTable: array[0..15] of Char = '0123456789abcdef'; + GenDelims = [':', '/', '?', '#', '[', ']', '@']; + SubDelims = ['!', '$', '&', '''', '(', ')', '*', '+', ',', ';', '=']; + ALPHA = ['A'..'Z', 'a'..'z']; + DIGIT = ['0'..'9']; + Unreserved = ALPHA + DIGIT + ['-', '.', '_', '~']; + ValidPathChars = Unreserved + SubDelims + ['@', ':', '/']; +function Escape(const s: String; const Allowed: TSysCharSet): String; +var + i: Integer; +begin + SetLength(Result, 0); + for i := 1 to Length(s) do + if not (s[i] in Allowed) then + Result := Result + '%' + IntToHex(ord(s[i]), 2) + else + Result := Result + s[i]; +end; function EncodeURI(const URI: TURI): String; - - function Escape(const s: String): String; - var - i: Integer; - begin - SetLength(Result, 0); - for i := 1 to Length(s) do - if not (s[i] in ['0'..'9', 'A'..'Z', 'a'..'z', ',', '-', '.', '_', - '/', '\']) then - Result := Result + '%' + HexTable[Ord(s[i]) shr 4] + - HexTable[Ord(s[i]) and $f] - else - Result := Result + s[i]; - end; - +// ! if no scheme then first colon in path should be escaped begin SetLength(Result, 0); if Length(URI.Protocol) > 0 then Result := LowerCase(URI.Protocol) + ':'; - if Length(URI.Host) > 0 then + if URI.HasAuthority then begin Result := Result + '//'; if Length(URI.Username) > 0 then @@ -80,17 +95,17 @@ begin end; if URI.Port <> 0 then Result := Result + ':' + IntToStr(URI.Port); - Result := Result + Escape(URI.Path); + Result := Result + Escape(URI.Path, ValidPathChars); if Length(URI.Document) > 0 then begin - if (Length(Result) = 0) or (Result[Length(Result)] <> '/') then + if (Length(URI.Path) > 0) and ((Length(Result) = 0) or (Result[Length(Result)] <> '/')) then Result := Result + '/'; - Result := Result + Escape(URI.Document); + Result := Result + Escape(URI.Document, ValidPathChars); end; if Length(URI.Params) > 0 then - Result := Result + '?' + URI.Params; + Result := Result + '?' + Escape(URI.Params, ValidPathChars); if Length(URI.Bookmark) > 0 then - Result := Result + '#' + Escape(URI.Bookmark); + Result := Result + '#' + Escape(URI.Bookmark, ValidPathChars); end; function ParseURI(const URI: String): TURI; @@ -98,155 +113,313 @@ begin Result := ParseURI(URI, '', 0); end; -function ParseURI(const URI, DefaultProtocol: String; DefaultPort: Word): TURI; - - function Unescape(const s: String): String; - - function HexValue(c: Char): Integer; - begin - if (c >= '0') and (c <= '9') then - Result := Ord(c) - Ord('0') - else if (c >= 'A') and (c <= 'F') then - Result := Ord(c) - Ord('A') + 10 - else if (c >= 'a') and (c <= 'f') then - Result := Ord(c) - Ord('a') + 10 - else - Result := 0; - end; - - var - i, RealLength: Integer; - begin - SetLength(Result, Length(s)); - i := 1; - RealLength := 0; - while i <= Length(s) do - begin - Inc(RealLength); - if s[i] = '%' then - begin - Result[RealLength] := Chr(HexValue(s[i + 1]) shl 4 or HexValue(s[i + 2])); - Inc(i, 3); - end else - begin - Result[RealLength] := s[i]; - Inc(i); - end; - end; - SetLength(Result, RealLength); +function HexValue(c: Char): Integer; +begin + case c of + '0'..'9': Result := ord(c) - ord('0'); + 'A'..'F': Result := ord(c) - (ord('A') - 10); + 'a'..'f': Result := ord(c) - (ord('a') - 10); + else + Result := 0; end; +end; +function Unescape(const s: String): String; var - s: String; - i, LastValidPos: Integer; + i, RealLength: Integer; +begin + SetLength(Result, Length(s)); + i := 1; + RealLength := 0; + while i <= Length(s) do + begin + Inc(RealLength); + if s[i] = '%' then + begin + Result[RealLength] := Chr(HexValue(s[i + 1]) shl 4 or HexValue(s[i + 2])); + Inc(i, 3); + end else + begin + Result[RealLength] := s[i]; + Inc(i); + end; + end; + SetLength(Result, RealLength); +end; + +function ParseURI(const URI, DefaultProtocol: String; DefaultPort: Word): TURI; +var + s, Authority: String; + i: Integer; begin Result.Protocol := LowerCase(DefaultProtocol); Result.Port := DefaultPort; s := URI; - // Extract the protocol + // Extract scheme for i := 1 to Length(s) do if s[i] = ':' then begin Result.Protocol := Copy(s, 1, i - 1); - s := Copy(s, i + 1, Length(s)); - break; - end else if not (s[i] in ['0'..'9', 'A'..'Z', 'a'..'z']) then + s := Copy(s, i + 1, MaxInt); break; + end + else + if not (((i=1) and (s[i] in ALPHA)) or (s[i] in ALPHA + DIGIT + ['+', '-', '.'])) then + break; - // Extract the bookmark name + // Extract the bookmark - for i := Length(s) downto 1 do - if s[i] = '#' then - begin - Result.Bookmark := Unescape(Copy(s, i + 1, Length(s))); - s := Copy(s, 1, i - 1); - break; - end else if s[i] = '/' then - break; + i := LastDelimiter('#', s); + if i > 0 then + begin + Result.Bookmark := Unescape(Copy(s, i + 1, MaxInt)); + s := Copy(s, 1, i - 1); + end; // Extract the params - for i := Length(s) downto 1 do - if s[i] = '?' then - begin - Result.Params := Copy(s, i + 1, Length(s)); - s := Copy(s, 1, i - 1); - break; - end else if s[i] = '/' then - break; + i := LastDelimiter('?', s); + if i > 0 then + begin + Result.Params := Unescape(Copy(s, i + 1, MaxInt)); + s := Copy(s, 1, i - 1); + end; - // Extract the document name + // extract authority + + if (Length(s) > 1) and (s[1] = '/') and (s[2] = '/') then + begin + i := 3; + while (i <= Length(s)) and (s[i] <> '/') do + Inc(i); + Authority := Copy(s, 3, i-3); + s := Copy(s, i, MaxInt); + Result.HasAuthority := True; // even if Authority is empty + end + else + begin + Result.HasAuthority := False; + Authority := ''; + end; + + // now s is 'hier-part' per RFC3986 + // Extract the document name (nasty...) for i := Length(s) downto 1 do if s[i] = '/' then begin Result.Document := Unescape(Copy(s, i + 1, Length(s))); - s := Copy(s, 1, i - 1); + if (Result.Document <> '.') and (Result.Document <> '..') then + s := Copy(s, 1, i) + else + Result.Document := ''; break; end else if s[i] = ':' then - break; + break + else if i = 1 then + begin + Result.Document := Unescape(s); + if (Result.Document <> '.') and (Result.Document <> '..') then + s := '' + else + Result.Document := ''; + // break - not needed, last iteration + end; - // Extract the path + // Everything left is a path - LastValidPos := 0; - for i := Length(s) downto 1 do - if (s[i] = '/') - and ((I>1) and (S[i-1]<>'/')) - and ((I'/')) then - LastValidPos := i - else if s[i] in [':', '@'] then - break; - - if (LastValidPos > 0) and - (Length(S)>LastValidPos) and - (S[LastValidPos+1]<>'/') then - begin - Result.Path := Unescape(Copy(s, LastValidPos, Length(s))); - s := Copy(s, 1, LastValidPos - 1); - end; + Result.Path := Unescape(s); // Extract the port number - for i := Length(s) downto 1 do - if s[i] = ':' then - begin - Result.Port := StrToInt(Copy(s, i + 1, Length(s))); - s := Copy(s, 1, i - 1); - break; - end else if s[i] in ['@', '/'] then - break; + i := LastDelimiter(':@', Authority); + if (i > 0) and (Authority[i] = ':') then + begin + Result.Port := StrToInt(Copy(Authority, i + 1, MaxInt)); + Authority := Copy(Authority, 1, i - 1); + end; // Extract the hostname - if ((Length(s) > 2) and (s[1] = '/') and (s[2] = '/')) or - ((Length(s) > 1) and (s[1] <> '/')) then + i := Pos('@', Authority); + if i > 0 then begin - if s[1] <> '/' then - s := '//' + s; - for i := Length(s) downto 1 do - if s[i] in ['@', '/'] then - begin - Result.Host := Copy(s, i + 1, Length(s)); - s := Copy(s, 3, i - 3); - break; - end; + Result.Host := Copy(Authority, i+1, MaxInt); + Delete(Authority, i, MaxInt); // Extract username and password - if Length(s) > 0 then + if Length(Authority) > 0 then begin - i := Pos(':', s); + i := Pos(':', Authority); if i = 0 then - Result.Username := s + Result.Username := Authority else begin - Result.Username := Copy(s, 1, i - 1); - Result.Password := Copy(s, i + 1, Length(s)); + Result.Username := Copy(Authority, 1, i - 1); + Result.Password := Copy(Authority, i + 1, MaxInt); end; end; + end + else + Result.Host := Authority; +end; + +procedure RemoveDotSegments(var s: string); +var + Cur, Prev: Integer; +begin + Prev := Pos('/', s); + while (Prev > 0) and (Prev < Length(s)) do + begin + Cur := Prev+1; + while (Cur <= Length(s)) and (s[Cur] <> '/') do + Inc(Cur); + if (Cur - Prev = 2) and (s[Prev+1] = '.') then + Delete(s, Prev+1, 2) + else if (Cur - Prev = 3) and (s[Prev+1] = '.') and (s[Prev+2] = '.') then + begin + while (Prev > 1) and (s[Prev-1] <> '/') do + Dec(Prev); + if Prev > 1 then + Dec(Prev); + Delete(s, Prev+1, Cur-Prev); + end + else + Prev := Cur; + end; +end; + +// TODO: this probably must NOT percent-encode the result... +function ResolveRelativeURI(const BaseUri, RelUri: UTF8String; + out ResultUri: UTF8String): Boolean; +var + Base, Rel: TUri; +begin + Base := ParseUri(BaseUri); + Rel := ParseUri(RelUri); + + Result := (Base.Protocol <> '') or (Rel.Protocol <> ''); + if not Result then + Exit; + with Rel do + begin + if (Path = '') and (Document = '') then + begin + if (Protocol = '') and (Host = '') then + begin + if Params <> '' then + Base.Params := Params; + Base.Bookmark := Bookmark; + ResultUri := EncodeUri(Base); + Exit; + end; + end; + if (Protocol <> '') then // RelURI is absolute - return it... + begin + ResultUri := RelUri; + Exit; + end; + // Inherit protocol + Protocol := Base.Protocol; + if (Host = '') then // TODO: or "not HasAuthority"? + begin + // Inherit Authority (host, port, username, password) + Host := Base.Host; + Port := Base.Port; + Username := Base.Username; + Password := Base.Password; + HasAuthority := Base.HasAuthority; + if (Path = '') or (Path[1] <> '/') then // path is empty or relative + Path := Base.Path + Path; + RemoveDotSegments(Path); + end; + end; // with + ResultUri := EncodeUri(Rel); +end; + +function ResolveRelativeURI(const BaseUri, RelUri: WideString; + out ResultUri: WideString): Boolean; +var + rslt: UTF8String; +begin + Result := ResolveRelativeURI(UTF8Encode(BaseUri), UTF8Encode(RelUri), rslt); + if Result then + ResultURI := UTF8Decode(rslt); +end; + +function URIToFilename(const URI: string; out Filename: string): Boolean; +var + U: TURI; + I: Integer; +begin + Result := False; + U := ParseURI(URI); + if SameText(U.Protocol, 'file') then + begin + if (Length(U.Path) > 2) and (U.Path[1] = '/') and (U.Path[3] = ':') then + Filename := Copy(U.Path, 2, MaxInt) + else + Filename := U.Path; + Filename := Filename + U.Document; + Result := True; + end + else + if U.Protocol = '' then // fire and pray? + begin + Filename := U.Path + U.Document; + Result := True; + end; + if PathDelim <> '/' then + begin + I := Pos('/', Filename); + while I > 0 do + begin + Filename[I] := PathDelim; + I := Pos('/', Filename); + end; end; end; +function FilenameToURI(const Filename: string): string; +var + I: Integer; +begin + // TODO: seems implemented, but not tested well + Result := 'file://'; + if (Length(Filename) > 2) and (Filename[1] <> PathDelim) and (Filename[2] = ':') then + Result := Result + '/'; + Result := Result + Filename; + if PathDelim <> '/' then + begin + I := Pos(PathDelim, Result); + while I <> 0 do + begin + Result[I] := '/'; + I := Pos(PathDelim, Result); + end; + end; +end; + + +function IsAbsoluteURI(const UriReference: string): Boolean; +var + I: Integer; +begin + Result := True; + for I := 1 to Length(UriReference) do + begin + if UriReference[I] = ':' then + Exit + else + if not (((I=1) and (UriReference[I] in ALPHA)) or + (UriReference[i] in ALPHA + DIGIT + ['+', '-', '.'])) then + Break; + end; + Result := False; +end; + + end.