lazutf8: added overloaded UTF8FixBroken(var string)

git-svn-id: trunk@36319 -
This commit is contained in:
mattias 2012-03-25 16:24:14 +00:00
parent 98e9e68885
commit f0f090fb65
4 changed files with 17 additions and 20 deletions

View File

@ -54,7 +54,8 @@ function UTF8FindNearestCharStart(UTF8Str: PChar; Len: integer;
function UTF8CharStart(UTF8Str: PChar; Len, CharIndex: PtrInt): PChar;
// find the byte index of the n-th UTF8 character, ignoring BIDI (byte len of substr)
function UTF8CharToByteIndex(UTF8Str: PChar; Len, CharIndex: PtrInt): PtrInt;
procedure UTF8FixBroken(P: PChar);
procedure UTF8FixBroken(P: PChar); overload;
procedure UTF8FixBroken(var S: string); overload;
function UTF8CharacterStrictLength(P: PChar): integer;
function UTF8CStringToUTF8String(SourceStart: PChar; SourceLen: PtrInt) : string;
function UTF8Pos(const SearchForText, SearchInText: string): PtrInt;
@ -523,7 +524,7 @@ begin
c:=((ord(p^) and %00011111) shl 6);
//or (ord(p[1]) and %00111111);
if c<(1 shl 7) then
p^:=' '
p^:=' ' // fix XSS attack
else
inc(p,2)
end
@ -538,7 +539,7 @@ begin
or ((ord(p[1]) and %00111111) shl 6);
//or (ord(p[2]) and %00111111);
if c<(1 shl 11) then
p^:=' '
p^:=' ' // fix XSS attack
else
inc(p,3);
end else
@ -554,7 +555,7 @@ begin
or ((ord(p[2]) and %00111111) shl 6);
//or (ord(p[3]) and %00111111);
if c<(1 shl 16) then
p^:=' '
p^:=' ' // fix XSS attack
else
inc(p,4)
end else
@ -567,6 +568,14 @@ begin
end;
end;
// Replaces invalid UTF-8 bytes in S with spaces, modifying S in place.
//
// S is left untouched (and is not made unique) when it is empty or when
// FindInvalidUTF8Character reports no problem (a negative result) for the
// full length of S.
//
// The PChar overload works on a #0-terminated buffer, while a Pascal string
// may carry embedded #0 bytes. Every #0-delimited segment of S is therefore
// repaired separately, so invalid bytes located after an embedded #0 are
// fixed as well.
// NOTE(review): assumes the PChar overload returns at the first #0 and does
// not write past it - confirm against its implementation.
procedure UTF8FixBroken(var S: string);
var
  Cur, StrEnd: PChar;
begin
  if S='' then exit;
  // Nothing to repair: return before UniqueString so a shared string is not copied.
  if FindInvalidUTF8Character(PChar(S),length(S),true)<0 then exit;
  // The buffer is patched in place, so S needs its own copy first.
  UniqueString(S);
  Cur:=PChar(S);
  StrEnd:=Cur+length(S);
  while Cur<StrEnd do begin
    // Repair the segment that starts at Cur and runs up to the next #0.
    UTF8FixBroken(Cur);
    // Advance to that #0; the string's own terminator sits at StrEnd.
    while (Cur<StrEnd) and (Cur^<>#0) do inc(Cur);
    // Step over the embedded #0 to the start of the next segment.
    inc(Cur);
  end;
end;
function UTF8CharacterStrictLength(P: PChar): integer;
begin
if p=nil then exit(0);

View File

@ -1499,18 +1499,8 @@ begin
end;
// Repairs invalid UTF-8 in the page source FSrc in place.
// NOTE(review): this span comes from a rendered diff hunk whose header says
// 18 old lines became 8 new lines. The manual pointer loop below looks like
// the pre-commit body and the single UTF8FixBroken(FSrc) call its
// replacement - confirm against the repository before treating both as live code.
procedure TWikiPage.FixUTF8;
var
p: PChar;
e: PChar;
begin
// Empty source: nothing to do.
if FSrc='' then exit;
// The buffer is modified through a PChar, so FSrc is made unique first.
UniqueString(FSrc);
p:=PChar(FSrc);
// e points one past the last byte of FSrc.
e:=p+length(FSrc);
// Calls the PChar fixer at each character start up to the end of the buffer.
while p<e do begin
UTF8FixBroken(p);
// Advance by the byte length UTF8CharacterLength reports for the character at p.
inc(p,UTF8CharacterLength(p));
end;
// String overload: fixes FSrc as a whole.
UTF8FixBroken(FSrc);
end;
procedure Init;

View File

@ -278,7 +278,7 @@ It returns 0 if the codepoint can not be represented as a 1 to 4 byte UTF-8 sequ
</element>
<!-- procedure Visibility: default -->
<element name="UTF8FixBroken">
<short/>
<short>Replaces all invalid UTF-8 characters with spaces. Processing stops at the first #0 byte.</short>
<descr/>
<errors/>
<seealso/>

View File

@ -1706,10 +1706,8 @@ begin
for i:=1 to length(Result) do
if Result[i] in [#0..#31,#127] then Result[i]:=' ';
if Result='' then exit;
if FixUTF8 then begin
UniqueString(Result);
UTF8FixBroken(PChar(Result));
end;
if FixUTF8 then
UTF8FixBroken(Result);
Result:=UTF8Trim(Result);
end;