lazarus-ccr/components/fpspreadsheet/source/common/fpshtmlutils.pas
2016-11-06 16:07:29 +00:00

1192 lines
38 KiB
ObjectPascal
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

unit fpsHTMLUtils;
{$mode objfpc}{$H+}
interface
uses
Classes, SysUtils, contnrs, fpstypes, fpspreadsheet;
type
TsHTMLEntity = record
E: String;
Ch: String;
N: Word;
end;
function CleanHTMLString(AText: String): String;
function IsHTMLEntity(AText: PChar; out AEntity: TsHTMLEntity): Boolean;
function RemoveHTMLEntities(const AText: String): String;
type
TsHTMLAttr = class
Name: String;
Value: String;
constructor Create(AName, AValue: String);
end;
TsHTMLAttrList = class(TObjectList)
private
function GetItem(AIndex: Integer): TsHTMLAttr;
procedure SetItem(AIndex: Integer; AValue: TsHTMLAttr);
protected
procedure ParseStyle(AStyle: String);
public
function IndexOfName(AName: String): Integer;
procedure Parse(AHTML: String);
property Items[AIndex: Integer]: TsHTMLAttr read GetItem write SetItem; default;
end;
TsTagCase = (tcLowercase, tcUppercase, tcProperCase);
procedure HTMLToRichText(AWorkbook: TsWorkbook; AFont: TsFont;
const AHTMLText: String; out APlainText: String;
out ARichTextParams: TsRichTextParams);
procedure RichTextToHTML(AWorkbook: TsWorkbook; AFont: TsFont;
const APlainText: String; const ARichTextParams: TsRichTextParams;
out AHTMLText: String; APrefix:String = ''; ATagCase: TsTagCase = tcLowercase);
implementation
uses
math, lazUtf8, fasthtmlparser,
fpsUtils, fpsClasses;
const
// http://unicode.e-workers.de/entities.php
// http://www.utf8-zeichentabelle.de/unicode-utf8-table.pl
HTMLEntities: array[0..250] of TsHTMLEntity = (
// A
(E: 'Acirc'; Ch: 'Â'; N: 194), // 0
(E: 'acirc'; Ch: 'â'; N: 226),
(E: 'acute'; Ch: '´'; N: 180),
(E: 'AElig'; Ch: 'Æ'; N: 198),
(E: 'aelig'; Ch: 'æ'; N: 230),
(E: 'Agrave'; Ch: 'À'; N: 192),
(E: 'agrave'; Ch: 'à'; N: 224),
(E: 'alefsym';Ch: #$E2#$84#$B5; N: 8501),
(E: 'Alpha'; Ch: 'Α'; N: 913),
(E: 'alpha'; Ch: 'α'; N: 945),
(E: 'amp'; Ch: '&'; N: 38), // 10
(E: 'and'; Ch: '∧'; N: 8743),
(E: 'ang'; Ch: '∠'; N: 8736),
(E: 'apos'; Ch: ''''; N: 39),
(E: 'Aring'; Ch: 'Å'; N: 197),
(E: 'aring'; Ch: 'å'; N: 229),
(E: 'asymp'; Ch: '≈'; N: 2248),
(E: 'Atilde'; Ch: 'Ã'; N: 195),
(E: 'atilde'; Ch: 'ã'; N: 227),
(E: 'Auml'; Ch: 'Ä'; N: 196),
(E: 'auml'; Ch: 'ä'; N: 228), // 20
// B
(E: 'bdquo'; Ch: '„'; N: 8222), // 21
(E: 'Beta'; Ch: 'Β'; N: 914),
(E: 'beta'; Ch: 'β'; N: 946),
(E: 'brvbar'; Ch: '¦'; N: 166),
(E: 'bull'; Ch: '•'; N: 8226),
// C
(E: 'cap'; Ch: '∩'; N: 8745), // 26
(E: 'Ccedil'; Ch: 'Ç'; N: 199),
(E: 'ccedil'; Ch: 'ç'; N: 231),
(E: 'cedil'; Ch: '¸'; N: 184),
(E: 'cent'; Ch: '¢'; N: 162), // 30
(E: 'Chi'; Ch: 'Χ'; N: 935),
(E: 'chi'; Ch: 'χ'; N: 967),
(E: 'circ'; Ch: 'ˆ'; N: 710),
(E: 'clubs'; Ch: '♣'; N: 9827),
(E: 'cong'; Ch: #$E2#$89#$85; N: 8773), // approximately equal
(E: 'copy'; Ch: '©'; N: 169),
(E: 'crarr'; Ch: #$E2#$86#$B5; N: 8629), // carriage return
(E: 'cup'; Ch: ''; N: 8746),
(E: 'curren'; Ch: '¤'; N: 164),
// D
(E: 'Dagger'; Ch: '‡'; N: 8225), // 40
(E: 'dagger'; Ch: '†'; N: 8224),
(E: 'dArr'; Ch: #$E2#$87#$93; N: 8659), // wide down-arrow
(E: 'darr'; Ch: '↓'; N: 8595), // narrow down-arrow
(E: 'deg'; Ch: '°'; N: 176),
(E: 'Delta'; Ch: 'Δ'; N: 916),
(E: 'delta'; Ch: 'δ'; N: 948),
(E: 'diams'; Ch: '♦'; N: 9830),
(E: 'divide'; Ch: '÷'; N: 247),
// E
(E: 'Eacute'; Ch: 'É'; N: 201), // 49
(E: 'eacute'; Ch: 'é'; N: 233), // 50
(E: 'Ecirc'; Ch: 'Ê'; N: 202),
(E: 'ecirc'; Ch: 'ê'; N: 234),
(E: 'Egrave'; Ch: 'È'; N: 200),
(E: 'egrave'; Ch: 'è'; N: 232),
(E: 'empty'; Ch: #$e2#$88#$85; N: 8709),
(E: 'emsp'; Ch: #$E2#$80#$83; N: 8195), // Space character width of "m"
(E: 'ensp'; Ch: #$E2#$80#$82; N: 8194), // Space character width of "n"
(E: 'Epsilon';Ch: 'Ε'; N: 917), // capital epsilon
(E: 'epsilon';Ch: 'ε'; N: 949),
(E: 'equiv'; Ch: '≡'; N: 8801), // 60
(E: 'Eta'; Ch: 'Η'; N: 919),
(E: 'eta'; Ch: 'η'; N: 951),
(E: 'ETH'; Ch: 'Ð'; N: 208),
(E: 'eth'; Ch: 'ð'; N: 240),
(E: 'Euml'; Ch: 'Ë'; N: 203),
(E: 'euml'; Ch: 'ë'; N: 235),
(E: 'euro'; Ch: '€'; N: 8364),
(E: 'exist'; Ch: '∃'; N: 8707),
// F
(E: 'fnof'; Ch: 'ƒ'; N: 402), // 70
(E: 'forall'; Ch: '∀'; N: 8704),
(E: 'frac12'; Ch: '½'; N: 189),
(E: 'frac14'; Ch: '¼'; N: 188),
(E: 'frac34'; Ch: '¾'; N: 190),
(E: 'frasl'; Ch: ''; N: 8260),
// G
(E: 'Gamma'; Ch: 'Γ'; N: 915),
(E: 'gamma'; Ch: 'γ'; N: 947),
(E: 'ge'; Ch: '≥'; N: 8805),
(E: 'gt'; Ch: '>'; N: 62),
// H
(E: 'hArr'; Ch: '⇔'; N: 8660), // 80, wide horizontal double arrow
(E: 'harr'; Ch: '↔'; N: 8596), // narrow horizontal double arrow
(E: 'hearts'; Ch: '♥'; N: 9829),
(E: 'hellip'; Ch: '…'; N: 8230),
// I
(E: 'Iacute'; Ch: 'Í'; N: 205),
(E: 'iacute'; Ch: 'í'; N: 237),
(E: 'Icirc'; Ch: 'Î'; N: 206),
(E: 'icirc'; Ch: 'î'; N: 238),
(E: 'iexcl'; Ch: '¡'; N: 161),
(E: 'Igrave'; Ch: 'Ì'; N: 204),
(E: 'igrave'; Ch: 'ì'; N: 236), // 90
(E: 'image'; Ch: #$E2#$84#$91; N: 2465), // I in factura
(E: 'infin'; Ch: '∞'; N: 8734),
(E: 'int'; Ch: '∫'; N: 8747),
(E: 'Iota'; Ch: 'Ι'; N: 921),
(E: 'iota'; Ch: 'ι'; N: 953),
(E: 'iquest'; Ch: '¿'; N: 191),
(E: 'isin'; Ch: '∈'; N: 8712),
(E: 'Iuml'; Ch: 'Ï'; N: 207),
(E: 'iuml'; Ch: 'ï'; N: 239),
// K
(E: 'Kappa'; Ch: 'Κ'; N: 922), // 100
(E: 'kappa'; Ch: 'κ'; N: 254),
// L
(E: 'Lambda'; Ch: 'Λ'; N: 923),
(E: 'lambda'; Ch: 'λ'; N: 955),
(E: 'lang'; Ch: '⟨'; N: 9001), // Left-pointing angle bracket
(E: 'laquo'; Ch: '«'; N: 171),
(E: 'lArr'; Ch: #$E2#$87#$90; N: 8656), // Left-pointing wide arrow
(E: 'larr'; Ch: '←'; N: 8592),
(E: 'lceil'; Ch: '⌈'; N: 8968), // Left ceiling
(E: 'ldquo'; Ch: '“'; N: 8220),
(E: 'le'; Ch: '≤'; N: 8804), // 110
(E: 'lfloor'; Ch: '⌊'; N: 8970), // Left floor
(E: 'lowast'; Ch: #$e2#$88#$97; N: 8727), // Low asterisk
(E: 'loz'; Ch: '◊'; N: 9674),
(E: 'lrm'; Ch: ''; N: 8206), // Left-to-right mark
(E: 'lsaquo'; Ch: ''; N: 8249),
(E: 'lsquo'; Ch: ''; N: 8216),
(E: 'lt'; Ch: '<'; N: 60),
// M
(E: 'macr'; Ch: '¯'; N: 175),
(E: 'mdash'; Ch: '—'; N: 8212),
(E: 'micro'; Ch: 'µ'; N: 181), // 120
(E: 'middot'; Ch: '·'; N: 183),
(E: 'minus'; Ch: ''; N: 8722),
(E: 'Mu'; Ch: 'Μ'; N: 924),
(E: 'mu'; Ch: 'μ'; N: 956),
// N
(E: 'nabla'; Ch: '∇'; N: 8711),
(E: 'nbsp'; Ch: ' '; N: 160), // 126
(E: 'ndash'; Ch: ''; N: 8211),
(E: 'ne'; Ch: '≠'; N: 8800),
(E: 'ni'; Ch: '∋'; N: 8715),
(E: 'not'; Ch: '¬'; N: 172), // 130
(E: 'notin'; Ch: #$e2#$88#$89; N: 8713), // math: "not in"
(E: 'nsub'; Ch: #$e2#$8a#$84; N: 8836), // math: "not a subset of"
(E: 'Ntilde'; Ch: 'Ñ'; N: 209),
(E: 'ntilde'; Ch: 'ñ'; N: 241),
(E: 'Nu'; Ch: 'Ν'; N: 925),
(E: 'nu'; Ch: 'ν'; N: 957),
// O
(E: 'Oacute'; Ch: 'Ó'; N: 211),
(E: 'oacute'; Ch: 'ó'; N: 243),
(E: 'Ocirc'; Ch: 'Ô'; N: 212),
(E: 'ocirc'; Ch: 'ô'; N: 244),
(E: 'OElig'; Ch: 'Œ'; N: 338),
(E: 'oelig'; Ch: 'œ'; N: 339),
(E: 'Ograve'; Ch: 'Ò'; N: 210),
(E: 'ograve'; Ch: 'ò'; N: 242),
(E: 'oline'; Ch: '‾'; N: 8254),
(E: 'Omega'; Ch: 'Ω'; N: 937),
(E: 'omega'; Ch: 'ω'; N: 969),
(E: 'Omicron';Ch: 'Ο'; N: 927),
(E: 'omicron';Ch: 'ο'; N: 959),
(E: 'oplus'; Ch: #$e2#$8a#$95; N: 8853), // Circled plus
(E: 'or'; Ch: ''; N: 8744),
(E: 'ordf'; Ch: 'ª'; N: 170),
(E: 'ordm'; Ch: 'º'; N: 186),
(E: 'Oslash'; Ch: 'Ø'; N: 216),
(E: 'oslash'; Ch: 'ø'; N: 248),
(E: 'Otilde'; Ch: 'Õ'; N: 213),
(E: 'otilde'; Ch: 'õ'; N: 245),
(E: 'otimes'; Ch: #$E2#$8A#$97; N: 8855), // Circled times
(E: 'Ouml'; Ch: 'Ö'; N: 214),
(E: 'ouml'; Ch: 'ö'; N: 246),
// P
(E: 'para'; Ch: '¶'; N: 182),
(E: 'part'; Ch: '∂'; N: 8706),
(E: 'permil'; Ch: '‰'; N: 8240),
(E: 'perp'; Ch: '⊥'; N: 8869),
(E: 'Phi'; Ch: 'Φ'; N: 934),
(E: 'phi'; Ch: 'φ'; N: 966),
(E: 'Pi'; Ch: 'Π'; N: 928),
(E: 'pi'; Ch: 'π'; N: 960), // lower-case pi
(E: 'piv'; Ch: 'ϖ'; N: 982),
(E: 'plusmn'; Ch: '±'; N: 177),
(E: 'pound'; Ch: '£'; N: 163),
(E: 'Prime'; Ch: '″'; N: 8243),
(E: 'prime'; Ch: ''; N: 8242),
(E: 'prod'; Ch: '∏'; N: 8719),
(E: 'prop'; Ch: '∝'; N: 8733),
(E: 'Psi'; Ch: 'Ψ'; N: 936),
(E: 'psi'; Ch: 'ψ'; N: 968),
// Q
(E: 'quot'; Ch: '"'; N: 34),
// R
(E: 'radic'; Ch: '√'; N: 8730),
(E: 'rang'; Ch: '⟩'; N: 9002), // right-pointing angle bracket
(E: 'raquo'; Ch: '»'; N: 187),
(E: 'rArr'; Ch: '⇒'; N: 8658),
(E: 'rarr'; Ch: '→'; N: 8594),
(E: 'rceil'; Ch: '⌉'; N: 8969), // right ceiling
(E: 'rdquo'; Ch: '”'; N: 8221),
(E: 'real'; Ch: #$E2#$84#$9C; N: 8476), // R in factura
(E: 'reg'; Ch: '®'; N: 174),
(E: 'rfloor'; Ch: '⌋'; N: 8971), // Right floor
(E: 'Rho'; Ch: 'Ρ'; N: 929),
(E: 'rho'; Ch: 'ρ'; N: 961),
(E: 'rlm'; Ch: ''; N: 8207), // right-to-left mark
(E: 'rsaquo'; Ch: ''; N: 8250),
(E: 'rsquo'; Ch: ''; N: 8217),
// S
(E: 'sbquo'; Ch: ''; N: 8218),
(E: 'Scaron'; Ch: 'Š'; N: 352),
(E: 'scaron'; Ch: 'š'; N: 353),
(E: 'sdot'; Ch: #$E2#$8B#$85; N: 8901), // math: dot operator
(E: 'sect'; Ch: '§'; N: 167),
(E: 'shy'; Ch: #$C2#$AD; N: 173), // conditional hyphen
(E: 'Sigma'; Ch: 'Σ'; N: 931),
(E: 'sigma'; Ch: 'σ'; N: 963),
(E: 'sigmaf'; Ch: 'ς'; N: 962),
(E: 'sim'; Ch: #$E2#$88#$BC; N: 8764), // similar
(E: 'spades'; Ch: '♠'; N: 9824),
(E: 'sub'; Ch: '⊂'; N: 8834),
(E: 'sube'; Ch: '⊆'; N: 8838),
(E: 'sum'; Ch: '∑'; N: 8721),
(E: 'sup'; Ch: '⊃'; N: 8835),
(E: 'sup1'; Ch: '¹'; N: 185),
(E: 'sup2'; Ch: '²'; N: 178),
(E: 'sup3'; Ch: '³'; N: 179),
(E: 'supe'; Ch: '⊇'; N: 8839),
(E: 'szlig'; Ch: 'ß'; N: 223),
//T
(E: 'Tau'; Ch: 'Τ'; N: 932),
(E: 'tau'; Ch: 'τ'; N: 964),
(E: 'there4'; Ch: '∴'; N: 8756),
(E: 'Theta'; Ch: 'Θ'; N: 920),
(E: 'theta'; Ch: 'θ'; N: 952),
(E: 'thetasym';Ch: 'ϑ'; N: 977),
(E: 'thinsp'; Ch: #$E2#$80#$89; N: 8201), // thin space
(E: 'THORN'; Ch: 'Þ'; N: 222),
(E: 'thorn'; Ch: 'þ'; N: 254),
(E: 'tilde'; Ch: '˜'; N: 732),
(E: 'times'; Ch: '×'; N: 215),
(E: 'trade'; Ch: '™'; N: 8482),
// U
(E: 'Uacute'; Ch: 'Ú'; N: 218),
(E: 'uacute'; Ch: 'ú'; N: 250),
(E: 'uArr'; Ch: #$E2#$87#$91; N: 8657), // wide up-arrow
(E: 'uarr'; Ch: '↑'; N: 8593),
(E: 'Ucirc'; Ch: 'Û'; N: 219),
(E: 'ucirc'; Ch: 'û'; N: 251),
(E: 'Ugrave'; Ch: 'Ù'; N: 217),
(E: 'ugrave'; Ch: 'ù'; N: 249),
(E: 'uml'; Ch: '¨'; N: 168),
(E: 'upsih'; Ch: 'ϒ'; N: 978),
(E: 'Upsilon';Ch: 'Υ'; N: 933),
(E: 'upsilon';Ch: 'υ'; N: 965),
(E: 'Uuml'; Ch: 'Ü'; N: 220),
(E: 'uuml'; Ch: 'ü'; N: 252),
// W
(E: 'weierp'; Ch: #$E2#$84#$98; N: 8472), // Script Capital P; Weierstrass Elliptic Function
// X
(E: 'Xi'; Ch: 'Ξ'; N: 926),
(E: 'xi'; Ch: 'ξ'; N: 958),
// Y
(E: 'Yacute'; Ch: 'Ý'; N: 221),
(E: 'yacute'; Ch: 'ý'; N: 253),
(E: 'yen'; Ch: '¥'; N: 165),
(E: 'Yuml'; Ch: 'Ÿ'; N: 376),
(E: 'yuml'; Ch: 'ÿ'; N: 255),
// Z
(E: 'Zeta'; Ch: 'Ζ'; N: 918),
(E: 'zeta'; Ch: 'ζ'; N: 950),
(E: 'zwj'; Ch: ''; N: 8205), // Zero-width joiner
(E: 'zwnj'; Ch: ''; N: 8204) // Zero-width non-joiner
);
function IsHTMLEntity(AText: PChar; out AEntity: TsHTMLEntity): Boolean;
function Compare(s: String): Boolean;
var
j: Integer;
begin
Result := false;
for j:=1 to Length(s) do
if s[j] <> PChar(AText)[j-1] then
exit;
if PChar(AText)[Length(s)] <> ';' then
exit;
Result := true;
end;
var
k: Integer;
equ: Boolean;
ch1: Char;
P: PChar;
begin
Result := false;
for k:=0 to High(HTMLEntities) do
begin
equ := Compare(HTMLEntities[k].E);
if not equ then
begin
P := AText;
ch1 := P^;
if ch1 = '#' then
begin
inc(P);
if ch1 = 'x' then
equ := Compare(Format('#x%x', [HTMLEntities[k].N]))
else
equ := Compare(Format('#%d', [HTMLEntities[k].N]));
end;
end;
if equ then
begin
AEntity := HTMLEntities[k];
Result := true;
exit;
end;
end;
end;
function CleanHTMLString(AText: String): String;
var
ent: TsHTMLEntity;
P: PChar;
ch: Char;
hasStartSpace, hasEndSpace: Boolean;
begin
Result := '';
// Remove leading and trailing spaces and line endings coming from formatted
// source lines. Retain 1 single space, at the end even without spaces found.
// No idea if this is 100% correct - at least, looks good.
hasStartSpace := false;
while (Length(AText) > 0) and (AText[1] in [#9, #13, #10, ' ']) do
begin
if AText[1] = ' ' then hasStartSpace := true; // A leading space will be added later
Delete(AText, 1, 1);
end;
hasEndSpace := false;
while (Length(AText) > 0) and (AText[Length(AText)] in [#9, #10, #13, ' ']) do
begin
hasEndSpace := true; // A trailing space will be added later
Delete(AText, Length(AText), 1);
end;
if AText = '' then
begin
if hasStartSpace or hasEndSpace then Result := ' ';
exit;
end;
// Replace HTML entities by their counter part UTF8 characters
P := @AText[1];
while (P^ <> #0) do begin
ch := P^;
case ch of
' ': begin
// collapse multiple spaces to a single space (HTML spec)
// http://stackoverflow.com/questions/24615355/browser-white-space-rendering
Result := Result + ' ';
inc(P);
while (P^ = ' ') do inc(P);
dec(P);
end;
'&': begin
inc(P);
if (P <> nil) and IsHTMLEntity(P, ent) then
begin
Result := Result + ent.Ch;
inc(P, Length(ent.E));
end else
begin
Result := Result + '&';
Continue;
end;
end;
else
Result := Result + ch;
end;
inc(P);
end;
// Add leading and trailing spaces from above.
if hasStartSpace then Result := ' ' + Result;
if hasEndSpace then Result := Result + ' ';
end;
function RemoveHTMLEntities(const AText: String): String;
var
ent: TsHTMLEntity;
P: PChar;
ch: AnsiChar;
begin
Result := '';
P := @AText[1];
while (P^ <> #0) do begin
ch := P^;
case ch of
'&': begin
inc(P);
if (P <> nil) and IsHTMLEntity(P, ent) then
begin
Result := Result + ent.Ch;
inc(P, Length(ent.E));
end else
begin
Result := Result + '&';
Continue;
end;
end;
else Result := Result + ch;
end;
inc(P);
end;
end;
{==============================================================================}
{ TsHTMLAttr }
{==============================================================================}
constructor TsHTMLAttr.Create(AName, AValue: String);
begin
Name := AName;
Value := AValue;
end;
{==============================================================================}
{ TsHTMLAttrList }
{==============================================================================}
function TsHTMLAttrList.GetItem(AIndex: Integer): TsHTMLAttr;
begin
Result := TsHTMLAttr(inherited GetItem(AIndex));
end;
function TsHTMLAttrList.IndexOfName(AName: String): Integer;
begin
AName := Lowercase(AName);
for Result := 0 to Count-1 do
if GetItem(Result).Name = AName then
exit;
Result := -1;
end;
{ AHTML is a HTML string beginning with a < tag. Seeks the first space to split
off the HTML tag. Then seeks for = and " characters to extract the attributes
which are split into name/value pairs at the = character. The value part is
unquoted. }
procedure TsHTMLAttrList.Parse(AHTML: String);
var
i: Integer;
len: Integer;
value, nam: String;
begin
Clear;
if (AHTML[1] <> '<') then // just for simplification
raise Exception.Create('[THTMLAttrList.Parse] HTML tags expected.');
// Find first space
i := 1;
len := Length(AHTML);
while (i <= len) and (AHTML[i] <> ' ') do inc(i);
// Parse attribute string
nam := '';
while (i <= len) do
begin
case AHTML[i] of
'=': begin
inc(i);
value := '';
if AHTML[i] = '"' then
begin
inc(i); // skip the initial '"'
while AHTML[i] <> '"' do
begin
value := value + AHTML[i];
inc(i);
end;
inc(i); // skip the final '"'
end else
while not (AHTML[i] in [' ', '>', '/']) do
begin
value := value + AHTML[i];
inc(i);
end;
Add(TsHTMLAttr.Create(lowercase(trim(nam)), trim(value)));
nam := '';
end;
' ', '/', '>': ;
else nam := nam + AHTML[i];
end;
inc(i);
end;
i := IndexOfName('style');
if i > -1 then ParseStyle(Items[i].Value);
end;
{ AStyle is the value part of a 'style="...."' HTML string. Splits the into
individual records at the semicolons (;) and into name-value pairs at the
colon (:). Adds the name-value pairs to the list. }
procedure TsHTMLAttrList.ParseStyle(AStyle: String);
var
i, len: Integer;
value, nam: String;
begin
i := 1;
len := Length(AStyle);
// skip white space
while (i <= len) and (AStyle[i] = ' ') do inc(i);
// iterate through string
nam := '';
while (i <= len) do
begin
case AStyle[i] of
':': begin // name-value separator
inc(i); // skip ':' ...
while (i <= len) and (AStyle[i] = ' ') do inc(i); // ... and white space
value := '';
while (i <= len) and (AStyle[i] <> ';') do
begin
value := value + AStyle[i];
inc(i);
end;
Add(TsHTMLAttr.Create(lowercase(trim(nam)), UnquoteStr(trim(value))));
nam := '';
end;
' ': ; // skip white space
else nam := nam + AStyle[i];
end;
inc(i);
end;
end;
procedure TsHTMLAttrList.SetItem(AIndex: Integer; AValue: TsHTMLAttr);
begin
inherited SetItem(AIndex, AValue);
end;
{==============================================================================}
{ HTML-to-Rich-text conversion }
{==============================================================================}
type
TsHTMLAnalyzer = class(THTMLParser)
private
FWorkbook: TsWorkbook;
FPlainText: String;
FRichTextParams: TsRichTextParams;
FAttrList: TsHTMLAttrList;
FFontStack: TsIntegerStack;
FCurrFont: TsFont;
FPointSeparatorSettings: TFormatSettings;
FPreserveSpaces: Boolean;
function AddFont(AFont: TsFont): Integer;
procedure AddRichTextParam(AFont: TsFont; AHyperlinkIndex: Integer = -1);
procedure ProcessFontRestore;
procedure ReadFont(AFont: TsFont);
procedure TagFoundHandler(NoCaseTag, ActualTag: string);
procedure TextFoundHandler(AText: string);
public
constructor Create(AWorkbook: TsWorkbook; AFont: TsFont; AText: String);
destructor Destroy; override;
property PlainText: String read FPlainText;
property RichTextParams: TsRichTextParams read FRichTextParams;
property PreserveSpaces: Boolean read FPreserveSpaces write FPreserveSpaces;
end;
constructor TsHTMLAnalyzer.Create(AWorkbook: TsWorkbook; AFont: TsFont;
AText: String);
begin
if AWorkbook = nil then
raise Exception.Create('[TsHTMLAnalyzer.Create] Workbook required.');
if AFont = nil then
raise Exception.Create('[TsHTMLAnalyzer.Create] Font required.');
inherited Create(AText);
FWorkbook := AWorkbook;
OnFoundTag := @TagFoundHandler;
OnFoundText := @TextFoundHandler;
FPlainText := '';
SetLength(FRichTextParams, 0);
FAttrList := TsHTMLAttrList.Create;
FCurrFont := TsFont.Create;
FCurrFont.CopyOf(AFont);
FFontStack := TsIntegerStack.Create;
FPointSeparatorSettings := DefaultFormatSettings;
FPointSeparatorSettings.DecimalSeparator := '.';
end;
destructor TsHTMLAnalyzer.Destroy;
begin
FreeAndNil(FFontStack);
FreeAndNil(FCurrFont);
FreeAndNil(FAttrList);
inherited Destroy;
end;
{ Stores a font in the workbook's font list. Does not allow duplicates. }
function TsHTMLAnalyzer.AddFont(AFont: TsFont): Integer;
var
fnt: TsFont;
begin
// Is the font already stored in the workbook's font list?
Result := FWorkbook.FindFont(AFont.FontName, AFont.Size, AFont.Style, AFont.Color, AFont.Position);
if Result = -1 then
begin
// No. Create a new font, add it to the list, and return the new index.
fnt := TsFont.Create;
fnt.CopyOf(AFont);
Result := FWorkbook.AddFont(fnt);
end;
end;
procedure TsHTMLAnalyzer.AddRichTextParam(AFont: TsFont;
AHyperlinkIndex: Integer = -1);
var
len: Integer;
fntIndex: Integer;
n: Integer;
begin
n := Length(FRichTextParams);
len := UTF8Length(FPlainText);
fntIndex := AddFont(AFont);
if (n > 0) and (FRichTextparams[n-1].FirstIndex = len+1) then
begin
// Avoid adding another rich-text parameter for the same text location:
// Update the previous one
FRichTextParams[n-1].FontIndex := fntIndex;
FRichTextParams[n-1].HyperlinkIndex := AHyperlinkIndex;
end else
begin
// Add a new rich-text parameter
SetLength(FRichTextParams, n+1);
FRichTextParams[n].FirstIndex := len + 1;
FRichTextParams[n].FontIndex := fntIndex;
FRichTextParams[n].HyperlinkIndex := AHyperlinkIndex;
end;
end;
procedure TsHTMLAnalyzer.ProcessFontRestore;
var
fntIndex: Integer;
begin
fntIndex := FFontStack.Pop;
if fntIndex > -1 then
begin
FCurrFont.CopyOf(FWorkbook.GetFont(fntIndex));
AddRichTextParam(FCurrFont);
end;
end;
procedure TsHTMLAnalyzer.ReadFont(AFont: TsFont);
const
FACTOR = 1.2;
MIN_FONTSIZE = 6;
var
idx: Integer;
L: TStringList;
i, ip, im: Integer;
s: String;
f: Double;
defFntSize: Single;
begin
idx := FAttrList.IndexOfName('font-family'); // style tag
if idx = -1 then
idx := FAttrList.IndexOfName('face'); // html tag
if idx > -1 then begin
L := TStringList.Create;
try
L.StrictDelimiter := true;
L.DelimitedText := FAttrList[idx].Value;
AFont.FontName := L[0];
finally
L.Free;
end;
end;
idx := FAttrList.IndexOfName('font-size');
if idx = -1 then
idx := FAttrList.IndexOfName('size');
if idx > -1 then begin
defFntSize := FWorkbook.GetDefaultFont.Size;
s := FAttrList[idx].Value;
case s of
'medium', '3' : AFont.Size := defFntSize;
'large', '4' : AFont.Size := defFntSize*FACTOR;
'x-large', '5' : AFont.Size := defFntSize*FACTOR*FACTOR;
'xx-large', '6' : AFont.Size := defFntSize*FACTOR*FACTOR*FACTOR;
'small', '2' : AFont.Size := Max(MIN_FONTSIZE, defFntSize/FACTOR);
'x-small' : AFont.Size := Max(MIN_FONTSIZE, defFntSize/FACTOR/FACTOR);
'xx-small', '1' : AFont.Size := Max(MIN_FONTSIZE, defFntSize/FACTOR/FACTOR/FACTOR);
'larger' : AFont.Size := AFont.Size * FACTOR;
'smaller' : AFont.Size := Max(MIN_FONTSIZE, AFont.Size / FACTOR);
else
if s[1] in ['+', '-'] then
begin
TryStrToInt(s, i);
AFont.Size := defFntSize * IntPower(FACTOR, i);
end else
begin
i := 0;
im := 0;
ip := pos('%', s);
if ip = 0 then begin
im := pos('rem', s);
if im = 0 then
im := pos('em', s);
end;
if (ip > 0) then i := ip else
if (im > 0) then i := im;
if i > 0 then
begin
s := copy(s, 1, i-1);
if TryStrToFloat(s, f, FPointSeparatorSettings) then
begin
if ip > 0 then f := f * 0.01;
AFont.Size := Max(MIN_FONTSIZE, abs(f) * defFntSize);
end;
end else
AFont.Size := Max(MIN_FONTSIZE, HTMLLengthStrToPts(s));
end;
end;
end;
idx := FAttrList.IndexOfName('font-style');
if idx > -1 then
case FAttrList[idx].Value of
'normal' : Exclude(AFont.Style, fssItalic);
'italic' : Include(AFont.Style, fssItalic);
'oblique' : Include(AFont.Style, fssItalic);
end;
idx := FAttrList.IndexOfName('font-weight');
if idx > -1 then
begin
s := FAttrList[idx].Value;
if TryStrToInt(s, i) and (i >= 700) then Include(AFont.Style, fssBold);
end;
idx := FAttrList.IndexOfName('text-decoration');
if idx > -1 then
begin
s := FAttrList[idx].Value;
if pos('underline', s) <> 0 then Include(AFont.Style, fssUnderline);
if pos('line-through', s) <> 0 then Include(AFont.Style, fssStrikeout);
end;
idx := FAttrList.IndexOfName('color');
if idx > -1 then
AFont.Color := HTMLColorStrToColor(FAttrList[idx].Value);
end;
procedure TsHTMLAnalyzer.TagFoundHandler(NoCaseTag, ActualTag: String);
begin
case NoCaseTag[2] of
'B': case NoCaseTag of
'<B>' : begin
FFontStack.Push(AddFont(FCurrFont));
Include(FCurrFont.Style, fssBold);
AddRichTextParam(FCurrFont);
end;
'<BR>',
'<BR/>': FPlainText := FPlainText + FPS_LINE_ENDING;
else if (pos('<BR ', NoCaseTag) = 1) then
FPlainText := FPlainText + FPS_LINE_ENDING;
end;
'D': if (NoCaseTag = '<DEL>') then
begin
FFontStack.Push(AddFont(FCurrFont));
Include(FCurrFont.Style, fssStrikeout);
AddRichTextParam(FCurrFont);
end;
'E': if (NoCaseTag = '<EM>') then
begin
FFontStack.Push(AddFont(FCurrFont));
Include(FCurrFont.Style, fssItalic);
AddRichTextParam(FCurrFont);
end;
'F': if (pos('<FONT ', NoCaseTag) = 1) then
begin
FFontStack.Push(AddFont(FCurrFont));
FAttrList.Parse(ActualTag);
ReadFont(FCurrFont);
AddRichTextparam(FCurrFont);
end;
'I': case NoCaseTag of
'<I>' : begin
FFontStack.Push(AddFont(FCurrFont));
Include(FCurrFont.Style, fssItalic);
AddRichTextParam(FCurrFont);
end;
'<INS>': begin
FFontStack.Push(AddFont(FCurrFont));
Include(FCurrFont.Style, fssUnderline);
AddRichTextParam(FCurrFont);
end;
end;
'S': case NoCaseTag of
'<S>' : begin
FFontStack.Push(AddFont(FCurrFont));
Include(FCurrFont.Style, fssStrikeout);
AddRichTextParam(FCurrFont);
end;
'<STRONG>':begin
FFontStack.Push(AddFont(FCurrFont));
Include(FCurrFont.Style, fssBold);
AddRichTextParam(FCurrFont);
end;
'<SUB>': begin
FFontStack.Push(AddFont(FCurrFont));
FCurrFont.Position := fpSubscript;
AddRichTextParam(FCurrFont);
end;
'<SUP>': begin
FFontStack.Push(AddFont(FCurrFont));
FCurrFont.Position := fpSuperscript;
AddRichTextParam(FCurrFont);
end;
end;
'U': if (NoCaseTag = '<U>') then
begin
FFontStack.Push(AddFont(FCurrFont));
Include(FCurrFont.Style, fssUnderline);
AddRichTextParam(FCurrFont);
end;
'/': case NoCaseTag[3] of
'B': if (NoCaseTag) = '</B>' then ProcessFontRestore;
'D': if (NoCaseTag) = '</DEL>' then ProcessFontRestore;
'E': if (NoCaseTag) = '</EM>' then ProcessFontRestore;
'F': if (NoCaseTag) = '</FONT>' then ProcessFontRestore;
'I': if (NoCaseTag = '</I>') or (NoCaseTag = '</INS>') then ProcessFontRestore;
'S': if (NoCaseTag = '</S>') or (NoCaseTag = '</STRONG>') or
(NoCaseTag = '</SUB>') or (NoCaseTag = '</SUP>') then ProcessFontRestore;
'U': if (NoCaseTag = '</U>') then ProcessFontRestore;
end;
end;
end;
procedure TsHTMLAnalyzer.TextFoundHandler(AText: String);
begin
if not FPreserveSpaces then
AText := CleanHTMLString(AText) else
AText := RemoveHTMLEntities(AText);
if AText <> '' then
begin
if FPlainText = '' then
FPlainText := AText
else
FPlainText := FPlainText + AText;
end;
end;
{@@ ----------------------------------------------------------------------------
Extracts rich-text parameters out of an html-formatted string and returns the
plain text
-------------------------------------------------------------------------------}
procedure HTMLToRichText(AWorkbook: TsWorkbook; AFont: TsFont;
const AHTMLText: String; out APlainText: String;
out ARichTextParams: TsRichTextParams);
var
analyzer: TsHTMLAnalyzer;
j: Integer;
begin
analyzer := TsHTMLAnalyzer.Create(AWorkbook, AFont, AHTMLText + '<end>');
try
analyzer.PreserveSpaces := true;
analyzer.Exec;
APlainText := analyzer.PlainText;
SetLength(ARichTextParams, Length(analyzer.RichTextParams));
for j:=0 to High(ARichTextParams) do
ARichTextParams[j] := analyzer.RichTextParams[j];
finally
analyzer.Free;
end;
end;
{==============================================================================}
{ Rich-text-to-HTML conversion }
{==============================================================================}
type
TsHTMLComposer = class
private
FPointSeparatorSettings: TFormatSettings;
FWorkbook: TsWorkbook;
FBaseFont: TsFont;
FPlainText: String;
FRichTextParams: TsRichTextParams;
FPrefix: String;
FTagCase: TsTagCase;
procedure GetFontsFromWorkbook(out AFonts: TsFontArray);
function GetTextOfRichTextParam(AIndex: Integer): String;
protected
function FixTagCase(ATag: String): String;
public
constructor Create(AWorkbook: TsWorkbook; AFont: TsFont; APrefix: String = '';
ATagCase: TsTagCase = tcLowercase);
function Exec(const APlainText: String; const ARichTextParams: TsRichTextParams): String;
end;
constructor TsHTMLComposer.Create(AWorkbook: TsWorkbook; AFont: TsFont;
APrefix: String = ''; ATagCase: TsTagCase = tcLowercase);
begin
FPointSeparatorSettings := DefaultFormatSettings;
FPointSeparatorSettings.DecimalSeparator := '.';
FWorkbook := AWorkbook;
FBaseFont := AFont;
FPrefix := APrefix;
FTagCase := ATagCase;
end;
function TsHTMLComposer.Exec(const APlainText: String;
const ARichTextParams: TsRichTextParams): String;
type
TChangeFlag = (cfFontName, cfFontSize, cfFontColor);
const
EPS = 1E-3;
var
i: Integer;
prevFnt, currFnt: TsFont;
chgFlags: set of TChangeFlag;
openingTag, closingTag: String;
fonts: TsFontArray;
tag: String;
begin
if Length(ARichTextParams) = 0 then
begin
Result := FPlainText;
exit;
end;
FRichTextParams := ARichTextParams;
FPlainText := APlainText;
prevFnt := TsFont.Create;
prevFnt.CopyOf(FBaseFont);
if FRichTextParams[0].FirstIndex > 1 then
Result := GetTextOfRichTextParam(-1) else
Result := '';
GetFontsFromWorkbook(fonts);
for i:=0 to High(FRichTextParams) do
begin
currFnt := fonts[i];
openingTag := '';
closingTag := '';
if not SameFont(currFnt, prevFnt) then
begin
chgFlags := [];
if not SameText(prevFnt.FontName, currFnt.FontName) then
Include(chgFlags, cfFontName);
if not SameValue(currFnt.Size, prevFnt.Size, EPS) then
Include(chgFlags, cfFontSize);
if currFnt.Color <> prevFnt.Color then
Include(chgFlags, cfFontColor);
if [cfFontName, cfFontSize, cfFontColor] * chgFlags <> [] then
begin
tag := FixTagCase('font');
openingTag := '<' + tag;
if cfFontName in chgFlags then
begin
openingTag := openingTag + ' ' + FPrefix + FixTagCase('face') +
'="' + UnquoteStr(currFnt.FontName) + '"';
prevFnt.FontName := currFnt.FontName;
end;
if cfFontSize in chgFlags then
begin
openingTag := openingTag + ' ' + FPrefix + FixTagCase('size') +
'="' + Format('%.gpt', [currFnt.Size], FPointSeparatorSettings) + '"';
prevFnt.Size := currFnt.Size;
end;
if cfFontColor in chgFlags then
begin
openingTag := openingTag + ' ' + FPrefix + FixTagCase('color') +
'="' + ColorToHTMLColorStr(currFnt.Color) + '"';
prevFnt.Size := currFnt.Color;
end;
openingTag := openingTag + '>';
closingTag :='</' + tag + '>' + closingTag;
end;
if (fssBold in currFnt.Style) then
begin
tag := FixTagCase('b');
openingTag := openingTag + '<' + tag + '>';
closingTag := '</' + tag + '>' + closingTag;
prevFnt.Style := prevFnt.Style + [fssBold];
end else
prevFnt.Style := prevFnt.Style - [fssBold];;
if (fssItalic in currFnt.Style) then
begin
tag := FixTagCase('i');
openingTag := openingTag + '<' + tag + '>';
closingTag := '</' + tag + '>' + closingTag;
prevFnt.Style := prevFnt.Style + [fssItalic];
end else
prevFnt.Style := prevFnt.Style - [fssItalic];
if (fssUnderline in currFnt.Style) then
begin
tag := FixTagCase('u');
openingTag := openingTag + '<' + tag + '>';
closingTag := '</' + tag + '>' + closingTag;
prevFnt.Style := prevFnt.Style + [fssUnderline];
end else
prevFnt.Style := prevFnt.Style - [fssUnderline];
if (fssStrikeout in currFnt.Style) then
begin
tag := FixTagCase('s');
openingTag := openingTag + '<' + tag + '>';
closingTag := '</' + tag + '>' + closingTag;
prevFnt.Style := prevFnt.Style + [fssStrikeout];
end else
prevFnt.Style := prevFnt.Style - [fssStrikeout];
if currFnt.Position <> prevFnt.Position then
begin
if currFnt.Position = fpSuperscript then
begin
tag := FixTagCase('sup');
openingTag := openingTag + '<' + tag + '>';
closingTag := '</' + tag + '>' + closingTag;
currFnt.Position := fpSuperscript;
end else
if currFnt.Position = fpSubscript then
begin
tag := FixTagCase('sub');
openingTag := openingTag + '<' + tag + '>';
closingTag := '</' + tag + '>' + closingTag;
currFnt.Position := fpSubscript;
end else
currFnt.Position := fpNormal;
end;
end;
// Add the node text with opening and closing tags (reverse order as opening!)
Result := Result + openingTag + GetTextOfRichTextParam(i) + closingTag;
end; // for
end;
function TsHTMLComposer.FixTagCase(ATag: String): String;
begin
case FTagCase of
tcLowercase:
Result := Lowercase(ATag);
tcUppercase:
Result := Uppercase(ATag);
tcProperCase:
begin
Result := Lowercase(ATag);
Result[1] := UpCase(Result[1]);
end;
end;
end;
procedure TsHTMLComposer.GetFontsFromWorkbook(out AFonts: TsFontArray);
var
i: Integer;
begin
SetLength(AFonts, Length(FRichTextParams));
for i:=0 to High(AFonts) do
AFonts[i] := FWorkbook.GetFont(FRichTextParams[i].FontIndex);
end;
function TsHTMLComposer.GetTextOfRichTextParam(AIndex: Integer): String;
var
p1, p2: Integer;
begin
if AIndex = -1 then
Result := UTF8Copy(FPlainText, 1, FRichTextParams[0].FirstIndex-1)
else
if AIndex <= High(FRichTextParams) then
begin
p1 := FRichTextParams[AIndex].FirstIndex;
if AIndex < High(FRichTextparams) then
p2 := FRichTextParams[AIndex+1].FirstIndex else
p2 := UTF8Length(FPlainText) + 1;
Result := UTF8Copy(FPlaiNText, p1, p2-p1);
end else
Result := '';
end;
{@@ ----------------------------------------------------------------------------
Constructs a html-coded string from a plain text string and
rich-text parameters
-------------------------------------------------------------------------------}
procedure RichTextToHTML(AWorkbook: TsWorkbook; AFont: TsFont;
const APlainText: String; const ARichTextParams: TsRichTextParams;
out AHTMLText: String; APrefix: String = ''; ATagCase: TsTagCase = tcLowercase);
var
composer: TsHTMLComposer;
begin
if Length(ARichTextParams) = 0 then
AHTMLText := APlainText
else
begin
composer := TsHTMLComposer.Create(AWorkbook, AFont, APrefix, ATagCase);
try
AHTMLText := composer.Exec(APlainText, ARichTextParams);
finally
composer.Free;
end;
end;
end;
end.