lazutf8: utf8trim: trim no break spaces

git-svn-id: trunk@35903 -
This commit is contained in:
mattias 2012-03-12 19:42:05 +00:00
parent 0e0af79ac7
commit 9bb0b8ea71

View File

@ -74,6 +74,7 @@ type
u8tKeepEnd, u8tKeepEnd,
u8tKeepTabs, u8tKeepTabs,
u8tKeepLineBreaks, u8tKeepLineBreaks,
u8tKeepNoBreakSpaces,
u8tKeepControlCodes // excluding tabs and line breaks u8tKeepControlCodes // excluding tabs and line breaks
); );
TUTF8TrimFlags = set of TUTF8TrimFlag; TUTF8TrimFlags = set of TUTF8TrimFlag;
@ -2328,9 +2329,11 @@ var
u: Cardinal; u: Cardinal;
StartP: PtrUInt; StartP: PtrUInt;
l: Integer; l: Integer;
KeepAllNonASCII: boolean;
begin begin
Result:=s; Result:=s;
if Result='' then exit; if Result='' then exit;
KeepAllNonASCII:=[u8tKeepControlCodes,u8tKeepNoBreakSpaces]*Flags=[u8tKeepControlCodes,u8tKeepNoBreakSpaces];
if not (u8tKeepStart in Flags) then begin if not (u8tKeepStart in Flags) then begin
// trim start // trim start
p:=PChar(Result); p:=PChar(Result);
@ -2356,13 +2359,18 @@ begin
break; break;
#128..#255: #128..#255:
begin begin
if u8tKeepControlCodes in Flags then break; if KeepAllNonASCII then break;
u:=UTF8CharacterToUnicode(p,l); u:=UTF8CharacterToUnicode(p,l);
if (l<=1) then break; // invalid character if (l<=1) then break; // invalid character
case u of case u of
128..159, // C1 set of control codes 128..159, // C1 set of control codes
8206, 8207: // left-to-right, right-to-left mark 8206, 8207: // left-to-right, right-to-left mark
; if u8tKeepControlCodes in Flags then break;
160, // no break space
$2007, // figure space
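$2026, // narrow no-break space -- NOTE(review): U+2026 is HORIZONTAL ELLIPSIS; narrow no-break space is U+202F, confirm the intended constant
$FEFF: // zero width no-break space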
if u8tKeepNoBreakSpaces in Flags then break;
else else
break; break;
end; end;
@ -2398,17 +2406,23 @@ begin
break; break;
#128..#255: #128..#255:
begin begin
if u8tKeepControlCodes in Flags then break; if KeepAllNonASCII then break;
StartP:=UTF8FindNearestCharStart(PChar(Result),length(Result),p-PChar(Result)); StartP:=UTF8FindNearestCharStart(PChar(Result),length(Result),p-PChar(Result));
u:=UTF8CharacterToUnicode(PChar(Result)+StartP,l); u:=UTF8CharacterToUnicode(PChar(Result)+StartP,l);
if (l<=1) then break; // invalid character if (l<=1) then break; // invalid character
case u of case u of
128..159, // C1 set of control codes 128..159, // C1 set of control codes
8206, 8207: // left-to-right, right-to-left mark 8206, 8207: // left-to-right, right-to-left mark
p:=PChar(Result)+StartP; if u8tKeepControlCodes in Flags then break;
160, // no break space
$2007, // figure space
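$2026, // narrow no-break space -- NOTE(review): U+2026 is HORIZONTAL ELLIPSIS; narrow no-break space is U+202F, confirm the intended constant
$FEFF: // zero width no-break space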
if u8tKeepNoBreakSpaces in Flags then break;
else else
break; break;
end; end;
p:=PChar(Result)+StartP;
end; end;
else else
break; break;